/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/* All functions below use the op_pixels_func register convention:
 *   x0 = block (destination), x1 = pixels (source),
 *   x2 = line_size (stride),  w3 = h (number of rows)
 */

/* Copy a 16-pixel-wide block of h rows; with \avg, average it into the
 * destination instead. */
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             x12, x0
  .endif
1:      ld1             {v0.16B},  [x1], x2
        ld1             {v1.16B},  [x1], x2
        ld1             {v2.16B},  [x1], x2
        ld1             {v3.16B},  [x1], x2
  .if \avg
        ld1             {v4.16B},  [x12], x2
        urhadd          v0.16B,  v0.16B,  v4.16B
        ld1             {v5.16B},  [x12], x2
        urhadd          v1.16B,  v1.16B,  v5.16B
        ld1             {v6.16B},  [x12], x2
        urhadd          v2.16B,  v2.16B,  v6.16B
        ld1             {v7.16B},  [x12], x2
        urhadd          v3.16B,  v3.16B,  v7.16B
  .endif
        subs            w3,  w3,  #4
        st1             {v0.16B},  [x0], x2
        st1             {v1.16B},  [x0], x2
        st1             {v2.16B},  [x0], x2
        st1             {v3.16B},  [x0], x2
        b.ne            1b
        ret
.endm

/* 16 wide, horizontal half-pel: each output is avg(src[x], src[x+1]). */
.macro  pixels16_x2     rnd=1, avg=0
1:      ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v2.16B, v3.16B}, [x1], x2
        subs            w3,  w3,  #2
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        avg             v0.16B,  v0.16B,  v1.16B
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        avg             v2.16B,  v2.16B,  v3.16B
  .if \avg
        ld1             {v1.16B}, [x0], x2
        ld1             {v3.16B}, [x0]
        urhadd          v0.16B,  v0.16B,  v1.16B
        urhadd          v2.16B,  v2.16B,  v3.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.16B}, [x0], x2
        st1             {v2.16B}, [x0], x2
        b.ne            1b
        ret
.endm

/* 16 wide, vertical half-pel: each output is avg(src[y], src[y+1]).
 * The last two rows are peeled off after the loop. */
.macro  pixels16_y2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
        ld1             {v1.16B}, [x1], x2
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2
        b.ne            1b

        avg             v2.16B,  v0.16B,  v1.16B
        ld1             {v0.16B}, [x1], x2
        avg             v3.16B,  v0.16B,  v1.16B
  .if \avg
        ld1             {v4.16B}, [x0], x2
        ld1             {v5.16B}, [x0]
        urhadd          v2.16B,  v2.16B,  v4.16B
        urhadd          v3.16B,  v3.16B,  v5.16B
        sub             x0,  x0,  x2
  .endif
        st1             {v2.16B}, [x0], x2
        st1             {v3.16B}, [x0], x2

        ret
.endm

/* 16 wide, 2D half-pel: average of four neighbours, (a+b+c+d+2)>>2 when
 * rounding, (a+b+c+d+1)>>2 otherwise (the +1 bias comes from the NRND
 * adds of v26).  v16-v23 carry the 16-bit horizontal pair sums of the
 * previous row across iterations, so each source row is loaded once. */
.macro  pixels16_xy2    rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        ld1             {v4.16B, v5.16B}, [x1], x2
NRND    movi            v26.8H,  #1
        ext             v1.16B,  v0.16B,  v1.16B,  #1
        ext             v5.16B,  v4.16B,  v5.16B,  #1
        uaddl           v16.8H,  v0.8B,   v1.8B
        uaddl2          v20.8H,  v0.16B,  v1.16B
        uaddl           v18.8H,  v4.8B,   v5.8B
        uaddl2          v22.8H,  v4.16B,  v5.16B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        ld1             {v2.16B, v3.16B}, [x1], x2
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v3.16B,  v2.16B,  v3.16B,  #1
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        uaddl           v18.8H,  v2.8B,   v3.8B
        uaddl2          v22.8H,  v2.16B,  v3.16B
        st1             {v30.16B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B, v1.16B}, [x1], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        ext             v30.16B, v0.16B,  v1.16B,  #1
        add             v1.8H,   v20.8H,  v22.8H
        mshrn           v28.8B,  v24.8H,  #2
NRND    add             v1.8H,   v1.8H,   v26.8H
        mshrn2          v28.16B, v1.8H,   #2
  .if \avg
        ld1             {v16.16B}, [x0]
        urhadd          v28.16B, v28.16B, v16.16B
  .endif
        uaddl           v16.8H,  v0.8B,   v30.8B
        uaddl2          v20.8H,  v0.16B,  v30.16B
        st1             {v28.16B}, [x0], x2
        add             v24.8H,  v16.8H,  v18.8H
NRND    add             v24.8H,  v24.8H,  v26.8H
        add             v0.8H,   v20.8H,  v22.8H
        mshrn           v30.8B,  v24.8H,  #2
NRND    add             v0.8H,   v0.8H,   v26.8H
        mshrn2          v30.16B, v0.8H,   #2
  .if \avg
        ld1             {v18.16B}, [x0]
        urhadd          v30.16B, v30.16B, v18.16B
  .endif
        st1             {v30.16B}, [x0], x2

        ret
.endm

/* Copy an 8-pixel-wide block of h rows; with \avg, average it into the
 * destination instead. */
.macro  pixels8         rnd=1, avg=0
1:      ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
        ld1             {v2.8B}, [x1], x2
        ld1             {v3.8B}, [x1], x2
  .if \avg
        ld1             {v4.8B}, [x0], x2
        urhadd          v0.8B,  v0.8B,  v4.8B
        ld1             {v5.8B}, [x0], x2
        urhadd          v1.8B,  v1.8B,  v5.8B
        ld1             {v6.8B}, [x0], x2
        urhadd          v2.8B,  v2.8B,  v6.8B
        ld1             {v7.8B}, [x0], x2
        urhadd          v3.8B,  v3.8B,  v7.8B
        sub             x0,  x0,  x2,  lsl #2
  .endif
        subs            w3,  w3,  #4
        st1             {v0.8B}, [x0], x2
        st1             {v1.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        st1             {v3.8B}, [x0], x2
        b.ne            1b
        ret
.endm

/* 8 wide, horizontal half-pel. */
.macro  pixels8_x2      rnd=1, avg=0
1:      ld1             {v0.8B, v1.8B}, [x1], x2
        ext             v1.8B,  v0.8B,  v1.8B,  #1
        ld1             {v2.8B, v3.8B}, [x1], x2
        ext             v3.8B,  v2.8B,  v3.8B,  #1
        subs            w3,  w3,  #2
        avg             v0.8B,  v0.8B,  v1.8B
        avg             v2.8B,  v2.8B,  v3.8B
  .if \avg
        ld1             {v4.8B}, [x0], x2
        ld1             {v5.8B}, [x0]
        urhadd          v0.8B,  v0.8B,  v4.8B
        urhadd          v2.8B,  v2.8B,  v5.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v0.8B}, [x0], x2
        st1             {v2.8B}, [x0], x2
        b.ne            1b
        ret
.endm

/* 8 wide, vertical half-pel; the last two rows are peeled off after the
 * loop. */
.macro  pixels8_y2      rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.8B}, [x1], x2
        ld1             {v1.8B}, [x1], x2
1:      subs            w3,  w3,  #2
        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
        ld1             {v1.8B}, [x1], x2
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2
        b.ne            1b

        avg             v4.8B,  v0.8B,  v1.8B
        ld1             {v0.8B}, [x1], x2
        avg             v5.8B,  v0.8B,  v1.8B
  .if \avg
        ld1             {v2.8B}, [x0], x2
        ld1             {v3.8B}, [x0]
        urhadd          v4.8B,  v4.8B,  v2.8B
        urhadd          v5.8B,  v5.8B,  v3.8B
        sub             x0,  x0,  x2
  .endif
        st1             {v4.8B}, [x0], x2
        st1             {v5.8B}, [x0], x2

        ret
.endm

/* 8 wide, 2D half-pel; v16/v17 carry the horizontal pair sums of the
 * previous two rows across iterations. */
.macro  pixels8_xy2     rnd=1, avg=0
        sub             w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        ld1             {v1.16B}, [x1], x2
NRND    movi            v19.8H,  #1
        ext             v4.16B,  v0.16B,  v4.16B,  #1
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v16.8H,  v0.8B,   v4.8B
        uaddl           v17.8H,  v1.8B,   v6.8B
1:      subs            w3,  w3,  #2
        ld1             {v0.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H,  v18.8H,  v19.8H
        uaddl           v16.8H,  v0.8B,   v4.8B
        mshrn           v5.8B,   v18.8H,  #2
        ld1             {v1.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H,  v18.8H,  v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B,   v18.8H,  #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        ext             v6.16B,  v1.16B,  v6.16B,  #1
        uaddl           v17.8H,  v1.8B,   v6.8B
        st1             {v7.8B}, [x0], x2
        b.gt            1b

        ld1             {v0.16B}, [x1], x2
        add             v18.8H,  v16.8H,  v17.8H
        ext             v4.16B,  v0.16B,  v4.16B,  #1
NRND    add             v18.8H,  v18.8H,  v19.8H
        uaddl           v16.8H,  v0.8B,   v4.8B
        mshrn           v5.8B,   v18.8H,  #2
        add             v18.8H,  v16.8H,  v17.8H
  .if \avg
        ld1             {v7.8B}, [x0]
        urhadd          v5.8B,  v5.8B,  v7.8B
  .endif
NRND    add             v18.8H,  v18.8H,  v19.8H
        st1             {v5.8B}, [x0], x2
        mshrn           v7.8B,   v18.8H,  #2
  .if \avg
        ld1             {v5.8B}, [x0]
        urhadd          v7.8B,  v7.8B,  v5.8B
  .endif
        st1             {v7.8B}, [x0], x2

        ret
.endm

/* Instantiate ff_\pfx\name\suf\()_neon from the macro \name.  When \rnd
 * is set, the helper macros expand to rounding forms (urhadd: (a+b+1)>>1,
 * rshrn/rshrn2: rounding narrowing shifts) and NRND drops its argument;
 * otherwise truncating forms (uhadd, shrn/shrn2) are used and NRND emits
 * the extra +1 bias adds needed by the no-rounding averages. */
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        urhadd          \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        rshrn           \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        rshrn2          \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        uhadd           \rd, \rn, \rm
    .endm
    .macro mshrn rd, rn, rm
        shrn            \rd, \rn, \rm
    .endm
    .macro mshrn2 rd, rn, rm
        shrn2           \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         mshrn
        .purgem         mshrn2
        .purgem         NRND
.endm

/* Emit both the rounding and the _no_rnd variant of a function. */
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

/* The qpel mc00 (full-pel) cases are plain copies: set h and fall
 * through into the pixels function emitted directly below.  The avg_
 * and qpel8 entry points further down work the same way. */
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             w3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             w3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1