/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

@ All prediction functions take r0 = block pointer, r1 = line stride.

@ Load a column of \n bytes from [\rs] with stride \rt into the lanes
@ of \rd; with \n == 4, \hi selects the low or high four lanes.
        .macro ldcol.8 rd, rs, rt, n=8, hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
        .endm

@ Sum the 16 bytes in \rl:\rh; the total ends up in every u16 lane of \dl.
        .macro add16x8 dq, dl, dh, rl, rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
        .endm

@ No neighbours available: fill the block with the mid value 128.
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
endfunc

@ DC from the 16 top neighbours.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

@ DC from the 16 left neighbours.
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        ldcol.8         d1,  r2,  r1
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

@ DC from all 32 neighbours (top row and left column).
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]
        sub             r2,  r0,  #1
        ldcol.8         d2,  r2,  r1
        ldcol.8         d3,  r2,  r1
        vaddl.u8        q0,  d0,  d1
        vaddl.u8        q1,  d2,  d3
        vadd.u16        q0,  q0,  q1
        vadd.u16        d0,  d0,  d1
        vpadd.u16       d0,  d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ Replicate the left neighbour across each row.
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #16
1:      vld1.8          {d0[],d1[]}, [r2], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ Copy the top row into all 16 rows.
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {q0},     [r0,:128], r1
        mov             r3,  #8
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ H.264 16x16 plane prediction.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #8
        sub             r3,  r3,  #1
        vld1.8          {d0},     [r3]
        vld1.8          {d2},     [r2,:64], r1
        ldcol.8         d1,  r3,  r1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1
        vrev64.8        q0,  q0
        vaddl.u8        q8,  d2,  d3
        vsubl.u8        q2,  d2,  d0            @ q2[i] = top[8+i]  - top[6-i]
        vsubl.u8        q3,  d3,  d1            @ q3[i] = left[8+i] - left[6-i]
        movrel          r3,  p16weight
        vld1.8          {q0},     [r3,:128]
        vmul.s16        q2,  q2,  q0
        vmul.s16        q3,  q3,  q0
        vadd.i16        d4,  d4,  d5
        vadd.i16        d5,  d6,  d7
        vpadd.i16       d4,  d4,  d5
        vpadd.i16       d4,  d4,  d4
        vshll.s16       q3,  d4,  #2
        vaddw.s16       q2,  q3,  d4
        vrshrn.s32      d4,  q2,  #6            @ d4 = {b, c}: (5*H+32)>>6, (5*V+32)>>6
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #3
        vrev64.16       d16, d17
        vsub.i16        d3,  d3,  d2
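        @ At this point d4 = {b, c}, d2[0] = b+c and d3[0] = 7*(b+c);
        @ d16[0] below becomes src[15,-1] + src[-1,15] + 1, so after the
        @ shift by 4 d2[0] holds the top-left seed value a + 16 - 7*(b+c).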
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #4
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0],    r3
        vmul.i16        q0,  q0,  d4[0]         @ q0 = {0, b, 2*b, ..., 7*b}
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q2            @ advance to the right half: +8*b
        vqshrun.s16     d1,  q1,  #5
        vadd.i16        q1,  q1,  q3            @ advance to the next row:  +c-8*b
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ Per-column weights used by the plane prediction functions.
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst

@ Replicate the left neighbour across each row.
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #8
1:      vld1.8          {d0[]},   [r2], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ Copy the top row into all 8 rows.
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {d0},     [r0,:64], r1
        mov             r3,  #4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ H.264 8x8 chroma plane prediction.
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1
        add             r2,  r3,  #4
        sub             r3,  r3,  #1
        vld1.32         {d0[0]},  [r3]
        vld1.32         {d2[0]},  [r2,:32], r1
        ldcol.8         d0,  r3,  r1,  4,  hi=1
        add             r3,  r3,  r1
        ldcol.8         d3,  r3,  r1,  4
        vaddl.u8        q8,  d2,  d3
        vrev32.8        d0,  d0
        vtrn.32         d2,  d3
        vsubl.u8        q2,  d2,  d0            @ top and left difference terms
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]
        vmul.s16        d4,  d4,  d0
        vmul.s16        d5,  d5,  d0
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4
        vshl.i32        d5,  d4,  #4
        vadd.s32        d4,  d4,  d5
        vrshrn.s32      d4,  q2,  #5            @ d4 = {b, c}: (17*H+16)>>5, (17*V+16)>>5
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5
        vshl.i16        d3,  d2,  #2
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4
        vsub.i16        d2,  d2,  d3
        vshl.i16        d3,  d4,  #3
        vext.16         q0,  q0,  q0,  #7
        vsub.i16        d6,  d5,  d3
        vmov.16         d0[0],    r3
        vmul.i16        q0,  q0,  d4[0]         @ q0 = {0, b, 2*b, ..., 7*b}
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3
        vadd.i16        q1,  q1,  q0
        vadd.i16        q3,  q3,  q2
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5
        vadd.i16        q1,  q1,  q3            @ advance to the next row: +c
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ No neighbours available: fill the block with the mid value 128.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction from the top row only.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        vtrn.32         d0,  d1
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction from the left column only.
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d1,  d0[1]
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction from the top row and left column; each 4x4 quadrant
@ gets its own DC value.
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2
.L_pred8x8_dc_end:
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2
6:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d1},     [r2,:64], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ DC prediction with the top row and only the upper half of the left
@ column available.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
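        @ After the widening pairwise add, d0 = {t0+t1, t2+t3, l0+l1, l2+l3}
        @ and d1 = {t4+t5, t6+t7, x, x}; the pairwise adds and rounding
        @ narrowing shifts below turn these into the four 4x4 DC values.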
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3
        vrshrn.u16      d3,  q0,  #2
        vdup.8          d0,  d2[4]
        vdup.8          d1,  d3[0]
        vdup.8          q2,  d3[2]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction with only the upper half of the left column available;
@ the lower half of the block is filled with 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2
        vmov.i8         d1,  #128
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction with the top row and only the lower half of the left
@ column available.
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4,  hi=1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2
        vrshrn.u16      d2,  q0,  #3
        vdup.8          d0,  d3[0]
        vdup.8          d1,  d3[3]
        vdup.8          d4,  d3[2]
        vdup.8          d5,  d2[5]
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ DC prediction with only the lower half of the left column available;
@ the upper half of the block is filled with 128.
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2
        vmov.i8         d0,  #128
        vdup.8          d1,  d1[0]
        b               .L_pred8x8_dc_end
endfunc