/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All pixel macros below share the same calling convention:
@   r0 = dst, r1 = src, r2 = line stride, r3 = block height
@ The avg, shrn and NRND helper macros they use are defined per
@ instantiation by the pixfunc macro at the bottom of this file.

@ Copy a 16xh block, or average it into dst when \avg is set.
.macro  pixels16        rnd=1, avg=0
  .if \avg
        mov             r12, r0
  .endif
1:      vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
        vld1.8          {q2},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {q3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {q8},  [r12,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.8          {q9},  [r12,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.8          {q10}, [r12,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.8          {q11}, [r12,:128], r2
        vrhadd.u8       q3,  q3,  q11
  .endif
        subs            r3,  r3,  #4
        vst1.64         {q0},  [r0,:128], r2
        vst1.64         {q1},  [r0,:128], r2
        vst1.64         {q2},  [r0,:128], r2
        vst1.64         {q3},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

@ 16xh horizontal half-pel: average each pixel with its right-hand neighbour.
.macro  pixels16_x2     rnd=1, avg=0
1:      vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        avg             q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        avg             q2,  q2,  q3
  .if \avg
        vld1.8          {q1},  [r0,:128], r2
        vld1.8          {q3},  [r0,:128]
        vrhadd.u8       q0,  q0,  q1
        vrhadd.u8       q2,  q2,  q3
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q0},  [r0,:128], r2
        vst1.8          {q2},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

@ 16xh vertical half-pel: average each row with the row below it.
.macro  pixels16_y2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
1:      subs            r3,  r3,  #2
        avg             q2,  q0,  q1
        vld1.8          {q0},  [r1], r2
        avg             q3,  q0,  q1
        vld1.8          {q1},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2
        bne             1b

        avg             q2,  q0,  q1
        vld1.8          {q0},  [r1], r2
        avg             q3,  q0,  q1
  .if \avg
        vld1.8          {q8},  [r0,:128], r2
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q2,  q2,  q8
        vrhadd.u8       q3,  q3,  q9
        sub             r0,  r0,  r2
  .endif
        vst1.8          {q2},  [r0,:128], r2
        vst1.8          {q3},  [r0,:128], r2

        bx              lr
.endm

@ 16xh diagonal half-pel: 2x2 average of src, src+1, src+stride, src+stride+1.
.macro  pixels16_xy2    rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0-d2}, [r1], r2
        vld1.8          {d4-d6}, [r1], r2
NRND    vmov.i16        q13, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vld1.8          {d2-d4}, [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
NRND    vadd.u16        q12, q12, q13
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.8          {q15}, [r0,:128], r2
        bgt             1b

        vld1.8          {d0-d2}, [r1], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        shrn            d28, q12, #2
NRND    vadd.u16        q1,  q1,  q13
        shrn            d29, q1,  #2
  .if \avg
        vld1.8          {q8},  [r0,:128]
        vrhadd.u8       q14, q14, q8
  .endif
        vaddl.u8        q8,  d0,  d30
        vaddl.u8        q10, d1,  d31
        vst1.8          {q14}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
NRND    vadd.u16        q12, q12, q13
        vadd.u16        q0,  q10, q11
        shrn            d30, q12, #2
NRND    vadd.u16        q0,  q0,  q13
        shrn            d31, q0,  #2
  .if \avg
        vld1.8          {q9},  [r0,:128]
        vrhadd.u8       q15, q15, q9
  .endif
        vst1.8          {q15}, [r0,:128], r2

        bx              lr
.endm

.macro  pixels8         rnd=1, avg=0
1:      vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
        vld1.8          {d2},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.8          {d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
  .if \avg
        vld1.8          {d4},  [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.8          {d5},  [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.8          {d6},  [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.8          {d7},  [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
  .endif
        subs            r3,  r3,  #4
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2
        vst1.8          {d2},  [r0,:64], r2
        vst1.8          {d3},  [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2      rnd=1, avg=0
1:      vld1.8          {q0},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.8          {q1},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        avg             q0,  q0,  q1
  .if \avg
        vld1.8          {d4},  [r0,:64], r2
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       q0,  q0,  q2
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d0},  [r0,:64], r2
        vst1.8          {d1},  [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2      rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {d0},  [r1], r2
        vld1.8          {d1},  [r1], r2
1:      subs            r3,  r3,  #2
        avg             d4,  d0,  d1
        vld1.8          {d0},  [r1], r2
        avg             d5,  d0,  d1
        vld1.8          {d1},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
  .if \avg
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2
        bne             1b

        avg             d4,  d0,  d1
        vld1.8          {d0},  [r1], r2
        avg             d5,  d0,  d1
  .if \avg
        vld1.8          {d2},  [r0,:64], r2
        vld1.8          {d3},  [r0,:64]
        vrhadd.u8       q2,  q2,  q1
        sub             r0,  r0,  r2
  .endif
        vst1.8          {d4},  [r0,:64], r2
        vst1.8          {d5},  [r0,:64], r2

        bx              lr
.endm

.macro  pixels8_xy2     rnd=1, avg=0
        sub             r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        vld1.8          {q1},  [r1], r2
NRND    vmov.i16        q11, #1
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.8          {q0},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vld1.8          {q1},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
  .if \avg
        vld1.8          {d7},  [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.8          {d7},  [r0,:64], r2
        bgt             1b

        vld1.8          {q0},  [r1], r2
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
NRND    vadd.u16        q10, q10, q11
        vaddl.u8        q8,  d0,  d4
        shrn            d5,  q10, #2
        vadd.u16        q10, q8,  q9
  .if \avg
        vld1.8          {d7},  [r0,:64]
        vrhadd.u8       d5,  d5,  d7
  .endif
NRND    vadd.u16        q10, q10, q11
        vst1.8          {d5},  [r0,:64], r2
        shrn            d7,  q10, #2
  .if \avg
        vld1.8          {d5},  [r0,:64]
        vrhadd.u8       d7,  d7,  d5
  .endif
        vst1.8          {d7},  [r0,:64], r2

        bx              lr
.endm

@ Instantiate ff_\pfx\name\suf\()_neon from one of the macros above.
@ When \rnd is set, the avg/shrn helpers round (vrhadd/vrshrn) and NRND
@ discards its argument; otherwise they truncate (vhadd/vshrn) and NRND
@ emits the extra +1 add.
.macro  pixfunc         pfx, name, suf, rnd=1, avg=0
  .if \rnd
    .macro avg  rd, rn, rm
        vrhadd.u8       \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vrshrn.u16      \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
    .endm
  .else
    .macro avg  rd, rn, rm
        vhadd.u8        \rd, \rn, \rm
    .endm
    .macro shrn rd, rn, rm
        vshrn.u16       \rd, \rn, \rm
    .endm
    .macro NRND insn:vararg
        \insn
    .endm
  .endif
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd, \avg
endfunc
        .purgem         avg
        .purgem         shrn
        .purgem         NRND
.endm

@ Emit both the rounding and the _no_rnd variant of a function.
.macro  pixfunc2        pfx, name, avg=0
        pixfunc         \pfx, \name,          rnd=1, avg=\avg
        pixfunc         \pfx, \name, _no_rnd, rnd=0, avg=\avg
.endm

@ The qpel mc00 (full-pel) functions just set the height and fall through
@ into the put/avg pixels16 or pixels8 function emitted immediately below.
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         put_, pixels16,     avg=0
        pixfunc2        put_, pixels16_x2,  avg=0
        pixfunc2        put_, pixels16_y2,  avg=0
        pixfunc2        put_, pixels16_xy2, avg=0

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3,  #16
endfunc

        pixfunc         avg_, pixels16,     avg=1
        pixfunc2        avg_, pixels16_x2,  avg=1
        pixfunc2        avg_, pixels16_y2,  avg=1
        pixfunc2        avg_, pixels16_xy2, avg=1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         put_, pixels8,     avg=0
        pixfunc2        put_, pixels8_x2,  avg=0
        pixfunc2        put_, pixels8_y2,  avg=0
        pixfunc2        put_, pixels8_xy2, avg=0

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3,  #8
endfunc

        pixfunc         avg_, pixels8,     avg=1
        pixfunc         avg_, pixels8_x2,  avg=1
        pixfunc         avg_, pixels8_y2,  avg=1
        pixfunc         avg_, pixels8_xy2, avg=1