/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"

/*
 * Block fill ("prediction") routines for 16x16 and 8x8 byte blocks,
 * AArch64 NEON.
 *
 * Register contract shared by every function in this file, as established
 * by the loads and stores below:
 *   x0  address of the top-left byte of the block that is written
 *   x1  byte offset from one row to the next
 * NOTE(review): this presumably matches a C prototype of the form
 *   void f(uint8_t *src, ptrdiff_t stride);
 * confirm against the C declarations.
 *
 * Notation used in the comments:
 *   t[i]  byte at x0 - x1 + i       (row above the block, t[-1] is the corner)
 *   l[j]  byte at x0 - 1 + j * x1   (column left of the block, l[-1] == t[-1])
 *
 * Scratch registers used: x2-x5 (w3 as loop counter) and v0-v7.  x0 is
 * advanced by the stores, x1 is doubled by the two *_vert_* functions and
 * the condition flags are modified by the loop counters.
 *
 * Several functions do not return themselves but branch to a shared store
 * loop (.L_pred16x16_dc_end / .L_pred8x8_dc_end) that lives inside another
 * function of this file.
 */

/*
 * ldcol.8 rd, rs, rt, n=8, hi=0
 *
 * Gather single bytes into lanes of vector register rd.  Each byte is
 * read from [rs], after which rs is post-incremented by register rt.
 *
 *   n = 16        fills byte lanes 0-15
 *   n = 8         fills byte lanes 0-7 (default)
 *   n = 4, hi=0   fills byte lanes 0-3 only
 *   n = 4, hi=1   fills byte lanes 4-7 only
 *
 * Lanes that are not filled keep their previous contents.  On exit rs has
 * advanced by (number of bytes loaded) * rt.
 */
.macro ldcol.8  rd,  rs,  rt,  n=8,  hi=0
.if \n >= 8 || \hi == 0
        ld1             {\rd\().b}[0],  [\rs], \rt
        ld1             {\rd\().b}[1],  [\rs], \rt
        ld1             {\rd\().b}[2],  [\rs], \rt
        ld1             {\rd\().b}[3],  [\rs], \rt
.endif
.if \n >= 8 || \hi == 1
        ld1             {\rd\().b}[4],  [\rs], \rt
        ld1             {\rd\().b}[5],  [\rs], \rt
        ld1             {\rd\().b}[6],  [\rs], \rt
        ld1             {\rd\().b}[7],  [\rs], \rt
.endif
.if \n == 16
        ld1             {\rd\().b}[8],  [\rs], \rt
        ld1             {\rd\().b}[9],  [\rs], \rt
        ld1             {\rd\().b}[10], [\rs], \rt
        ld1             {\rd\().b}[11], [\rs], \rt
        ld1             {\rd\().b}[12], [\rs], \rt
        ld1             {\rd\().b}[13], [\rs], \rt
        ld1             {\rd\().b}[14], [\rs], \rt
        ld1             {\rd\().b}[15], [\rs], \rt
.endif
.endm

/* Fill the 16x16 block with the constant 128; reads no neighbours. */
function ff_pred16x16_128_dc_neon, export=1
        movi            v0.16b,  #128
        b               .L_pred16x16_dc_end
endfunc

/* Fill the 16x16 block with (t[0] + ... + t[15] + 8) >> 4. */
function ff_pred16x16_top_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        ld1             {v0.16b}, [x2]
        uaddlv          h0,  v0.16b                     // h0 = sum of the 16 bytes
        rshrn           v0.8b,  v0.8h,  #4              // rounding shift: (sum + 8) >> 4
        dup             v0.16b, v0.b[0]                 // broadcast to a full row
        b               .L_pred16x16_dc_end
endfunc

/* Fill the 16x16 block with (l[0] + ... + l[15] + 8) >> 4. */
function ff_pred16x16_left_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 = &l[0]
        ldcol.8         v0,  x2,  x1,  16               // v0 = l[0..15]
        uaddlv          h0,  v0.16b
        rshrn           v0.8b,  v0.8h,  #4              // (sum + 8) >> 4
        dup             v0.16b, v0.b[0]
        b               .L_pred16x16_dc_end
endfunc

/*
 * Fill the 16x16 block with (sum(t[0..15]) + sum(l[0..15]) + 16) >> 5.
 * The tail of this function (.L_pred16x16_dc_end) is the store loop shared
 * by all 16x16 constant-fill variants: it writes v0.16b to 16 rows.
 */
function ff_pred16x16_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        sub             x3,  x0,  #1                    // x3 = &l[0]
        ld1             {v0.16b}, [x2]
        ldcol.8         v1,  x3,  x1,  16
        uaddlv          h0,  v0.16b                     // sum of the top row
        uaddlv          h1,  v1.16b                     // sum of the left column
        add             v0.4h,  v0.4h,  v1.4h
        rshrn           v0.8b,  v0.8h,  #5              // (sum + 16) >> 5
        dup             v0.16b, v0.b[0]
.L_pred16x16_dc_end:
        mov             w3,  #8                         // 8 iterations x 2 rows
6:      st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        b.ne            6b
        ret
endfunc

/* 16x16: fill every row j with the byte l[j] directly to its left. */
function ff_pred16x16_hor_neon, export=1
        sub             x2,  x0,  #1                    // x2 = &l[0]
        mov             w3,  #16                        // one row per iteration
1:      ld1r            {v0.16b}, [x2], x1              // load l[j], replicate to 16 lanes
        st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/* 16x16: copy the 16 bytes t[0..15] into every row of the block. */
function ff_pred16x16_vert_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        add             x1,  x1,  x1                    // step two rows at a time
        ld1             {v0.16b}, [x2], x1              // load t[]; x2 now addresses row 1
        mov             w3,  #8                         // 8 iterations x 2 rows
1:      st1             {v0.16b}, [x0], x1              // even rows
        st1             {v0.16b}, [x2], x1              // odd rows
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/*
 * 16x16 plane (linear gradient) fill.
 *
 * With k = 1..8:
 *   H = sum k * (t[7 + k] - t[7 - k])
 *   V = sum k * (l[7 + k] - l[7 - k])
 *   b = (5 * H + 32) >> 6
 *   c = (5 * V + 32) >> 6
 *   a = 16 * (t[15] + l[15] + 1) - 7 * (b + c)
 * and the byte written at column x, row y is (a + b * x + c * y) >> 5,
 * saturated to 0..255.
 */
function ff_pred16x16_plane_neon, export=1
        sub             x3,  x0,  x1                    // x3 = &t[0]
        movrel          x4,  p16weight
        add             x2,  x3,  #8                    // x2 = &t[8]
        sub             x3,  x3,  #1                    // x3 = &t[-1]
        ld1             {v0.8b},  [x3]                  // v0 = t[-1..6]
        ld1             {v2.8b},  [x2], x1              // v2 = t[8..15]
        ldcol.8         v1,  x3,  x1                    // v1 = l[-1..6]; x3 -> l[7]
        add             x3,  x3,  x1                    // skip l[7]
        ldcol.8         v3,  x3,  x1                    // v3 = l[8..15]
        rev64           v0.8b,  v0.8b                   // v0 = t[6], t[5], ..., t[-1]
        rev64           v1.8b,  v1.8b                   // v1 = l[6], l[5], ..., l[-1]
        uaddl           v7.8h,  v2.8b,  v3.8b           // lane 7 = t[15] + l[15]
        usubl           v2.8h,  v2.8b,  v0.8b           // t[7 + k] - t[7 - k]
        usubl           v3.8h,  v3.8b,  v1.8b           // l[7 + k] - l[7 - k]
        ld1             {v0.8h},  [x4]                  // weights 1..8
        mul             v2.8h,  v2.8h,  v0.8h
        mul             v3.8h,  v3.8h,  v0.8h
        addp            v2.8h,  v2.8h,  v3.8h           // three pairwise adds reduce to
        addp            v2.8h,  v2.8h,  v2.8h
        addp            v2.4h,  v2.4h,  v2.4h           // v2.4h = H, V, H, V
        sshll           v3.4s,  v2.4h,  #2
        saddw           v2.4s,  v3.4s,  v2.4h           // 5 * {H, V}
        rshrn           v4.4h,  v2.4s,  #6              // v4.4h = b, c, b, c
        trn2            v5.4h,  v4.4h,  v4.4h           // v5.4h = c, c, c, c
        add             v2.4h,  v4.4h,  v5.4h           // lane 0 = b + c
        shl             v3.4h,  v2.4h,  #3
        ext             v7.16b, v7.16b, v7.16b, #14     // lane 0 = t[15] + l[15]
        sub             v3.4h,  v3.4h,  v2.4h           // 7 * (b + c)
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (weight lane 0 is 1)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h           // lane 0 = a
        shl             v3.4h,  v4.4h,  #4              // lane 0 = 16 * b
        ext             v0.16b, v0.16b, v0.16b, #14     // rotate weights to 8, 1, ..., 7
        sub             v6.4h,  v5.4h,  v3.4h           // lane 0 = c - 16 * b
        mov             v0.h[0],  wzr                   // v0.8h = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v4.h[0]         // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]                 // a
        dup             v2.8h,  v4.h[0]                 // b
        dup             v3.8h,  v6.h[0]                 // c - 16 * b
        shl             v2.8h,  v2.8h,  #3              // v2 = 8 * b (step to columns 8..15)
        add             v1.8h,  v1.8h,  v0.8h           // v1 = a + b * x, row 0, columns 0..7
        add             v3.8h,  v3.8h,  v2.8h           // v3 = c - 8 * b (step to next row)
        mov             w3,  #16                        // one row per iteration
1:
        sqshrun         v0.8b,  v1.8h,  #5              // columns 0..7, clamp to 0..255
        add             v1.8h,  v1.8h,  v2.8h
        sqshrun2        v0.16b, v1.8h,  #5              // columns 8..15
        add             v1.8h,  v1.8h,  v3.8h           // back to column 0 of the next row
        st1             {v0.16b}, [x0], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/*
 * Tap weights for the plane fills.  p16weight is also loaded by the 8x8
 * variant, which uses it as the source of the "+1" term and, after rotating
 * and zeroing lane 0, of the column indices 0..7.
 */
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst
/* Weights 1..4 twice: lanes 0-3 for the top taps, lanes 4-7 for the left. */
const   p8weight, align=4
        .short          1,2,3,4,1,2,3,4
endconst

/* 8x8: fill every row j with the byte l[j] directly to its left. */
function ff_pred8x8_hor_neon, export=1
        sub             x2,  x0,  #1                    // x2 = &l[0]
        mov             w3,  #8                         // one row per iteration
1:      ld1r            {v0.8b},  [x2], x1              // load l[j], replicate to 8 lanes
        st1             {v0.8b},  [x0], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/* 8x8: copy the 8 bytes t[0..7] into every row of the block. */
function ff_pred8x8_vert_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        lsl             x1,  x1,  #1                    // step two rows at a time
        ld1             {v0.8b},  [x2], x1              // load t[]; x2 now addresses row 1
        mov             w3,  #4                         // 4 iterations x 2 rows
1:      st1             {v0.8b},  [x0], x1              // even rows
        st1             {v0.8b},  [x2], x1              // odd rows
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/*
 * 8x8 plane (linear gradient) fill.
 *
 * With k = 1..4:
 *   H = sum k * (t[3 + k] - t[3 - k])
 *   V = sum k * (l[3 + k] - l[3 - k])
 *   b = (17 * H + 16) >> 5
 *   c = (17 * V + 16) >> 5
 *   a = 16 * (t[7] + l[7] + 1) - 3 * (b + c)
 * and the byte written at column x, row y is (a + b * x + c * y) >> 5,
 * saturated to 0..255.
 */
function ff_pred8x8_plane_neon, export=1
        sub             x3,  x0,  x1                    // x3 = &t[0]
        movrel          x4,  p8weight
        movrel          x5,  p16weight
        add             x2,  x3,  #4                    // x2 = &t[4]
        sub             x3,  x3,  #1                    // x3 = &t[-1]
        ld1             {v0.s}[0],  [x3]                // v0 bytes 0-3 = t[-1..2]
        ld1             {v2.s}[0],  [x2], x1            // v2 bytes 0-3 = t[4..7]
        ldcol.8         v0,  x3,  x1,  4,  hi=1         // v0 bytes 4-7 = l[-1..2]; x3 -> l[3]
        add             x3,  x3,  x1                    // skip l[3]
        ldcol.8         v3,  x3,  x1,  4                // v3 bytes 0-3 = l[4..7]
        uaddl           v7.8h,  v2.8b,  v3.8b           // lane 3 = t[7] + l[7]
        rev32           v0.8b,  v0.8b                   // t[2..-1] | l[2..-1]
        trn1            v2.2s,  v2.2s,  v3.2s           // t[4..7]  | l[4..7]
        usubl           v2.8h,  v2.8b,  v0.8b           // t[3+k]-t[3-k] | l[3+k]-l[3-k]
        ld1             {v6.8h},  [x4]                  // weights 1..4, 1..4
        mul             v2.8h,  v2.8h,  v6.8h
        ld1             {v0.8h},  [x5]                  // 1..8, used for +1 and column index
        saddlp          v2.4s,  v2.8h
        addp            v2.4s,  v2.4s,  v2.4s           // v2.4s = H, V, H, V
        shl             v3.4s,  v2.4s,  #4
        add             v2.4s,  v3.4s,  v2.4s           // 17 * {H, V}
        rshrn           v5.4h,  v2.4s,  #5              // v5.4h = b, c, b, c
        addp            v2.4h,  v5.4h,  v5.4h           // b + c in every lane
        shl             v3.4h,  v2.4h,  #1
        add             v3.4h,  v3.4h,  v2.4h           // 3 * (b + c)
        rev64           v7.4h,  v7.4h                   // lane 0 = t[7] + l[7]
        add             v7.4h,  v7.4h,  v0.4h           // lane 0: + 1 (weight lane 0 is 1)
        shl             v2.4h,  v7.4h,  #4
        sub             v2.4h,  v2.4h,  v3.4h           // lane 0 = a
        ext             v0.16b, v0.16b, v0.16b, #14     // rotate weights to 8, 1, ..., 7
        mov             v0.h[0],  wzr                   // v0.8h = 0, 1, ..., 7
        mul             v0.8h,  v0.8h,  v5.h[0]         // b * x for x = 0..7
        dup             v1.8h,  v2.h[0]                 // a
        dup             v2.8h,  v5.h[1]                 // c (step to next row)
        add             v1.8h,  v1.8h,  v0.8h           // v1 = a + b * x, row 0
        mov             w3,  #8                         // one row per iteration
1:
        sqshrun         v0.8b,  v1.8h,  #5              // clamp to 0..255
        add             v1.8h,  v1.8h,  v2.8h
        st1             {v0.8b},  [x0], x1
        subs            w3,  w3,  #1
        b.ne            1b
        ret
endfunc

/*
 * The 8x8 constant-fill variants below treat the block as four 4x4
 * quadrants.  They all leave
 *   v0.8b = the 8 bytes written to each of rows 0-3
 *   v1.8b = the 8 bytes written to each of rows 4-7
 * and finish in the shared store loop .L_pred8x8_dc_end.
 */

/* Fill the 8x8 block with the constant 128; reads no neighbours. */
function ff_pred8x8_128_dc_neon, export=1
        movi            v0.8b,  #128
        movi            v1.8b,  #128
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, top row only: columns 0-3 of every row get (sum(t[0..3]) + 2) >> 2,
 * columns 4-7 get (sum(t[4..7]) + 2) >> 2.
 */
function ff_pred8x8_top_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        ld1             {v0.8b},  [x2]
        uaddlp          v0.4h,  v0.8b                   // pair sums
        addp            v0.4h,  v0.4h,  v0.4h           // sum(t[0..3]), sum(t[4..7]), ...
        zip1            v0.8h,  v0.8h,  v0.8h           // duplicate each sum
        rshrn           v2.8b,  v0.8h,  #2              // bytes: d0, d0, d1, d1, ...
        zip1            v0.8b,  v2.8b,  v2.8b           // d0 x 4, d1 x 4
        zip1            v1.8b,  v2.8b,  v2.8b           // same pattern for rows 4-7
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, left column only: rows 0-3 are filled with (sum(l[0..3]) + 2) >> 2,
 * rows 4-7 with (sum(l[4..7]) + 2) >> 2.
 */
function ff_pred8x8_left_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 = &l[0]
        ldcol.8         v0,  x2,  x1                    // v0 = l[0..7]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h           // sum(l[0..3]), sum(l[4..7]), ...
        rshrn           v2.8b,  v0.8h,  #2
        dup             v1.8b,  v2.b[1]                 // rows 4-7
        dup             v0.8b,  v2.b[0]                 // rows 0-3
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, top row and left column.  With T0 = sum(t[0..3]), T1 = sum(t[4..7]),
 * L0 = sum(l[0..3]), L1 = sum(l[4..7]):
 *   rows 0-3, columns 0-3: (T0 + L0 + 4) >> 3
 *   rows 0-3, columns 4-7: (T1 + 2) >> 2
 *   rows 4-7, columns 0-3: (L1 + 2) >> 2
 *   rows 4-7, columns 4-7: (T1 + L1 + 4) >> 3
 * The tail of this function (.L_pred8x8_dc_end) is the store loop shared by
 * all 8x8 constant-fill variants.
 */
function ff_pred8x8_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        sub             x3,  x0,  #1                    // x3 = &l[0]
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1
        uaddlp          v0.4h,  v0.8b                   // pair sums of t[]
        uaddlp          v1.4h,  v1.8b                   // pair sums of l[]
        trn1            v2.2s,  v0.2s,  v1.2s           // pairs of t[0..3] | l[0..3]
        trn2            v3.2s,  v0.2s,  v1.2s           // pairs of t[4..7] | l[4..7]
        addp            v4.4h,  v2.4h,  v3.4h           // T0, L0, T1, L1
        addp            v5.4h,  v4.4h,  v4.4h           // T0 + L0, T1 + L1, ...
        rshrn           v6.8b,  v5.8h,  #3              // two-sided averages
        rshrn           v7.8b,  v4.8h,  #2              // one-sided averages
        dup             v0.8b,  v6.b[0]                 // from T0 + L0
        dup             v2.8b,  v7.b[2]                 // from T1
        dup             v1.8b,  v7.b[3]                 // from L1
        dup             v3.8b,  v6.b[1]                 // from T1 + L1
        zip1            v0.2s,  v0.2s,  v2.2s           // rows 0-3: left half | right half
        zip1            v1.2s,  v1.2s,  v3.2s           // rows 4-7: left half | right half
.L_pred8x8_dc_end:
        mov             w3,  #4                         // 4 iterations x 2 rows
        add             x2,  x0,  x1,  lsl #2           // x2 = address of row 4
6:      st1             {v0.8b},  [x0], x1              // rows 0-3
        st1             {v1.8b},  [x2], x1              // rows 4-7
        subs            w3,  w3,  #1
        b.ne            6b
        ret
endfunc

/*
 * 8x8, top row plus l[0..3] only.  With T0, T1, L0 as defined for
 * ff_pred8x8_dc_neon:
 *   rows 0-3, columns 0-3: (T0 + L0 + 4) >> 3
 *   rows 0-3, columns 4-7: (T1 + 2) >> 2
 *   rows 4-7, columns 0-3: (T0 + 2) >> 2
 *   rows 4-7, columns 4-7: (T1 + 2) >> 2
 * Bytes 4-15 of v1 are never loaded; the lanes derived from them are
 * computed but not selected by any dup below.
 */
function ff_pred8x8_l0t_dc_neon, export=1
        sub             x2,  x0,  x1                    // x2 = &t[0]
        sub             x3,  x0,  #1                    // x3 = &l[0]
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4                // v1 bytes 0-3 = l[0..3]
        zip1            v0.4s,  v0.4s,  v1.4s           // t[0..3], l[0..3], t[4..7], unused
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h           // T0, L0, T1, unused, ...
        addp            v1.4h,  v0.4h,  v0.4h           // lane 0 = T0 + L0
        rshrn           v2.8b,  v0.8h,  #2              // one-sided averages
        rshrn           v3.8b,  v1.8h,  #3              // two-sided average
        dup             v4.8b,  v3.b[0]                 // from T0 + L0
        dup             v6.8b,  v2.b[2]                 // from T1
        dup             v5.8b,  v2.b[0]                 // from T0
        zip1            v0.2s,  v4.2s,  v6.2s           // rows 0-3
        zip1            v1.2s,  v5.2s,  v6.2s           // rows 4-7
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, l[0..3] only: rows 0-3 are filled with (sum(l[0..3]) + 2) >> 2,
 * rows 4-7 with the constant 128.
 */
function ff_pred8x8_l00_dc_neon, export=1
        sub             x2,  x0,  #1                    // x2 = &l[0]
        ldcol.8         v0,  x2,  x1,  4                // v0 bytes 0-3 = l[0..3]
        uaddlp          v0.4h,  v0.8b
        addp            v0.4h,  v0.4h,  v0.4h           // lane 0 = sum(l[0..3])
        rshrn           v0.8b,  v0.8h,  #2
        movi            v1.8b,  #128                    // rows 4-7
        dup             v0.8b,  v0.b[0]                 // rows 0-3
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, top row plus l[4..7] only.  With T0, T1, L1 as defined for
 * ff_pred8x8_dc_neon:
 *   rows 0-3, columns 0-3: (T0 + 2) >> 2
 *   rows 0-3, columns 4-7: (T1 + 2) >> 2
 *   rows 4-7, columns 0-3: (L1 + 2) >> 2
 *   rows 4-7, columns 4-7: (T1 + L1 + 4) >> 3
 * Bytes 0-3 of v1 are never loaded; the lanes derived from them are
 * computed but not selected by any dup below.
 */
function ff_pred8x8_0lt_dc_neon, export=1
        add             x3,  x0,  x1,  lsl #2           // address of row 4
        sub             x2,  x0,  x1                    // x2 = &t[0]
        sub             x3,  x3,  #1                    // x3 = &l[4]
        ld1             {v0.8b},  [x2]
        ldcol.8         v1,  x3,  x1,  4,  hi=1         // v1 bytes 4-7 = l[4..7]
        zip1            v0.4s,  v0.4s,  v1.4s           // t[0..3], unused, t[4..7], l[4..7]
        uaddlp          v0.8h,  v0.16b
        addp            v0.8h,  v0.8h,  v0.8h           // T0, unused, T1, L1, ...
        addp            v1.4h,  v0.4h,  v0.4h           // lane 1 = T1 + L1
        rshrn           v2.8b,  v0.8h,  #2              // one-sided averages
        rshrn           v3.8b,  v1.8h,  #3              // two-sided average
        dup             v4.8b,  v2.b[0]                 // from T0
        dup             v5.8b,  v2.b[3]                 // from L1
        dup             v6.8b,  v2.b[2]                 // from T1
        dup             v7.8b,  v3.b[1]                 // from T1 + L1
        zip1            v0.2s,  v4.2s,  v6.2s           // rows 0-3
        zip1            v1.2s,  v5.2s,  v7.2s           // rows 4-7
        b               .L_pred8x8_dc_end
endfunc

/*
 * 8x8, l[4..7] only: rows 0-3 are filled with the constant 128,
 * rows 4-7 with (sum(l[4..7]) + 2) >> 2.
 */
function ff_pred8x8_0l0_dc_neon, export=1
        add             x2,  x0,  x1,  lsl #2           // address of row 4
        sub             x2,  x2,  #1                    // x2 = &l[4]
        ldcol.8         v1,  x2,  x1,  4                // v1 bytes 0-3 = l[4..7]
        uaddlp          v2.4h,  v1.8b
        addp            v2.4h,  v2.4h,  v2.4h           // lane 0 = sum(l[4..7])
        rshrn           v1.8b,  v2.8h,  #2
        movi            v0.8b,  #128                    // rows 0-3
        dup             v1.8b,  v1.b[0]                 // rows 4-7
        b               .L_pred8x8_dc_end
endfunc