/*
 * VP8 NEON optimisations
 *
 * Copyright (c) 2010 Rob Clark <rob@ti.com>
 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]
        movi            v30.8h, #0

        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        st1             {v30.8h}, [x1], #16
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        st1             {v30.8h}, [x1]
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        movi            v16.4h, #3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        add             v0.4h,  v0.4h,  v16.4h

        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        sshr            v0.4h,  v0.4h,  #3
        sshr            v1.4h,  v1.4h,  #3
        sshr            v2.4h,  v2.4h,  #3
        sshr            v3.4h,  v3.4h,  #3

        mov             x3,  #32
        st1             {v0.h}[0], [x0], x3
        st1             {v1.h}[0], [x0], x3
        st1             {v2.h}[0], [x0], x3
        st1             {v3.h}[0], [x0], x3
        st1             {v0.h}[1], [x0], x3
        st1             {v1.h}[1], [x0], x3
        st1             {v2.h}[1], [x0], x3
        st1             {v3.h}[1], [x0], x3
        st1             {v0.h}[2], [x0], x3
        st1             {v1.h}[2], [x0], x3
        st1             {v2.h}[2], [x0], x3
        st1             {v3.h}[2], [x0], x3
        st1             {v0.h}[3], [x0], x3
        st1             {v1.h}[3], [x0], x3
        st1             {v2.h}[3], [x0], x3
        st1             {v3.h}[3], [x0], x3

        ret
endfunc
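
// note: the two multipliers below are the usual VP8 inverse-transform
// constants, 20091/65536 ~= sqrt(2)*cos(pi/8) - 1 and
// 35468/65536 ~= sqrt(2)*sin(pi/8).  Half of 35468 is loaded because
// sqdmulh doubles the product before taking the high half, so
// sqdmulh(x, 35468/2) == (x * 35468) >> 16.  The smull/shrn #16/add
// sequence likewise computes x + (x * 20091 >> 16), i.e. the full
// multiplication by sqrt(2)*cos(pi/8).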
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b}, [x1]
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4

        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v21.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v21.4h, v21.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h

        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v21.4h, v23.4h
        sub             v19.4h, v20.4h, v22.4h

        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        movi            v29.8h, #0
        smull           v26.4s, v1.4h,  v4.h[0]
        st1             {v29.8h},  [x1], #16
        smull           v27.4s, v3.4h,  v4.h[0]
        st1             {v29.16b}, [x1]
        sqdmulh         v21.4h, v1.4h,  v4.h[1]
        sqdmulh         v23.4h, v3.4h,  v4.h[1]
        shrn            v20.4h, v26.4s, #16
        shrn            v22.4h, v27.4s, #16
        add             v20.4h, v20.4h, v1.4h
        add             v22.4h, v22.4h, v3.4h
        add             v16.4h, v0.4h,  v2.4h
        sub             v17.4h, v0.4h,  v2.4h

        add             v18.4h, v20.4h, v23.4h
        ld1             {v24.s}[0], [x0], x2
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.s}[0], [x0], x2
        add             v0.4h,  v16.4h, v18.4h
        add             v1.4h,  v17.4h, v19.4h
        ld1             {v26.s}[0], [x0], x2
        sub             v3.4h,  v16.4h, v18.4h
        sub             v2.4h,  v17.4h, v19.4h
        ld1             {v27.s}[0], [x0], x2
        srshr           v0.4h,  v0.4h,  #3
        srshr           v1.4h,  v1.4h,  #3
        srshr           v2.4h,  v2.4h,  #3
        srshr           v3.4h,  v3.4h,  #3

        sub             x0,  x0,  x2,  lsl #2

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        uaddw           v0.8h,  v0.8h,  v24.8b
        uaddw           v1.8h,  v1.8h,  v25.8b
        uaddw           v2.8h,  v2.8h,  v26.8b
        uaddw           v3.8h,  v3.8h,  v27.8b
        sqxtun          v0.8b,  v0.8h
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h

        st1             {v0.s}[0], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v2.s}[0], [x0], x2
        st1             {v3.s}[0], [x0], x2

        ret
endfunc
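
// The three functions below handle DC-only blocks: the whole 4x4 residual
// is the single value (dc + 4) >> 3 (srshr #3 is a rounding shift, so the
// +4 is implicit), broadcast over the block and added to the prediction
// with unsigned saturation.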
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,  #0
        mov             x3,  #32
        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1], v17.d[0]
        ins             v18.d[1], v19.d[0]
        mov             x3,  x0
        srshr           v16.8h, v16.8h, #3 // dc >>= 3
        ld1             {v0.8b},  [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.8b},  [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.8b},  [x0], x2
        uaddw           v0.8h,  v16.8h, v1.8b
        ld1             {v3.8b},  [x0], x2
        uaddw           v22.8h, v16.8h, v2.8b
        ld1             {v4.8b},  [x0], x2
        uaddw           v2.8h,  v16.8h, v3.8b
        ld1             {v5.8b},  [x0], x2
        uaddw           v24.8h, v18.8h, v4.8b
        ld1             {v6.8b},  [x0], x2
        uaddw           v4.8h,  v18.8h, v5.8b
        ld1             {v7.8b},  [x0], x2
        uaddw           v26.8h, v18.8h, v6.8b
        sqxtun          v20.8b, v20.8h
        uaddw           v6.8h,  v18.8h, v7.8b
        sqxtun          v21.8b, v0.8h
        sqxtun          v22.8b, v22.8h
        st1             {v20.8b}, [x3], x2
        sqxtun          v23.8b, v2.8h
        st1             {v21.8b}, [x3], x2
        sqxtun          v24.8b, v24.8h
        st1             {v22.8b}, [x3], x2
        sqxtun          v25.8b, v4.8h
        st1             {v23.8b}, [x3], x2
        sqxtun          v26.8b, v26.8h
        st1             {v24.8b}, [x3], x2
        sqxtun          v27.8b, v6.8h
        st1             {v25.8b}, [x3], x2
        st1             {v26.8b}, [x3], x2
        st1             {v27.8b}, [x3], x2

        ret
endfunc

function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b, #0
        mov             x3,  #32
        ld1r            {v16.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v17.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v16.2d, v16.2d, v17.2d
        ld1r            {v18.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]
        st1             {v0.h}[0], [x1], x3
        zip1            v18.2d, v18.2d, v19.2d
        srshr           v16.8h, v16.8h, #3 // dc >>= 3
        ld1             {v0.16b}, [x0], x2
        srshr           v18.8h, v18.8h, #3
        ld1             {v1.16b}, [x0], x2
        uaddw           v20.8h, v16.8h, v0.8b
        ld1             {v2.16b}, [x0], x2
        uaddw2          v0.8h,  v18.8h, v0.16b
        ld1             {v3.16b}, [x0], x2
        uaddw           v21.8h, v16.8h, v1.8b
        uaddw2          v1.8h,  v18.8h, v1.16b
        uaddw           v22.8h, v16.8h, v2.8b
        uaddw2          v2.8h,  v18.8h, v2.16b
        uaddw           v23.8h, v16.8h, v3.8b
        uaddw2          v3.8h,  v18.8h, v3.16b
        sub             x0,  x0,  x2,  lsl #2
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b}, [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b}, [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b}, [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b}, [x0], x2

        ret
endfunc

function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,  #0
        ld1r            {v2.8h},   [x1]
        strh            w3,  [x1]
        srshr           v2.8h,  v2.8h,  #3
        ld1             {v0.s}[0], [x0], x2
        ld1             {v0.s}[1], [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0], [x0], x2
        ld1             {v1.s}[1], [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2,  lsl #2
        st1             {v0.s}[0], [x0], x2
        st1             {v0.s}[1], [x0], x2
        st1             {v1.s}[0], [x0], x2
        st1             {v1.s}[1], [x0], x2
        ret
endfunc
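
// The loop-filter macro below implements the VP8 filter decision on all
// 16 lanes at once:
//   filter = |P0-Q0|*2 + |P1-Q1|/2 <= flim_E
//            && |P3-P2|,|P2-P1|,|P1-P0|,|Q1-Q0|,|Q2-Q1|,|Q3-Q2| <= flim_I
//   hev    = |P1-P0| > hev_thresh || |Q1-Q0| > hev_thresh
// The comparison results are kept as byte masks instead of branches.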
// Register layout:
//   P3..Q3 -> v0..v7
//   flim_E -> v22
//   flim_I -> v23
//   hev_thresh -> x5
//
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor             v3.16b, v3.16b, v21.16b       // PS0 = P0 ^ 0x80
        eor             v4.16b, v4.16b, v21.16b       // QS0 = Q0 ^ 0x80

        movi            v20.8h, #3
        ssubl           v18.8h, v4.8b,  v3.8b         // QS0 - PS0
        ssubl2          v19.8h, v4.16b, v3.16b        //   (widened to 16bit)
        eor             v2.16b, v2.16b, v21.16b       // PS1 = P1 ^ 0x80
        eor             v5.16b, v5.16b, v21.16b       // QS1 = Q1 ^ 0x80
        mul             v18.8h, v18.8h, v20.8h        // w = 3 * (QS0 - PS0)
        mul             v19.8h, v19.8h, v20.8h

        sqsub           v20.16b, v2.16b, v5.16b       // clamp(PS1-QS1)
        movi            v22.16b, #4
        movi            v23.16b, #3
    .if \inner
        and             v20.16b, v20.16b, v17.16b     // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw           v18.8h,  v18.8h, v20.8b       // w += clamp(PS1-QS1)
        saddw2          v19.8h,  v19.8h, v20.16b
        sqxtn           v18.8b,  v18.8h               // narrow result back into v18
        sqxtn2          v18.16b, v19.8h
    .if !\inner && !\simple
        eor             v1.16b,  v1.16b, v21.16b      // PS2 = P2 ^ 0x80
        eor             v6.16b,  v6.16b, v21.16b      // QS2 = Q2 ^ 0x80
    .endif
        and             v18.16b, v18.16b, v16.16b     // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd           v19.16b, v18.16b, v22.16b     // c1 = clamp(w + 4)
        sqadd           v20.16b, v18.16b, v23.16b     // c2 = clamp(w + 3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd           v19.16b, v18.16b, v22.16b     // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v18.16b, v23.16b     // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)
        bic             v19.16b, v19.16b, v17.16b     // c1 & ~hev
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        srshr           v19.16b, v19.16b, #1          // c3 >>= 1
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        sqsub           v5.16b,  v5.16b,  v19.16b     // QS1 = clamp(QS1-c3)
        sqadd           v2.16b,  v2.16b,  v19.16b     // PS1 = clamp(PS1+c3)
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
    .else
        and             v20.16b, v18.16b, v17.16b     // w & hev
        sqadd           v19.16b, v20.16b, v22.16b     // c1 = clamp((w&hev)+4)
        sqadd           v20.16b, v20.16b, v23.16b     // c2 = clamp((w&hev)+3)
        sshr            v19.16b, v19.16b, #3          // c1 >>= 3
        sshr            v20.16b, v20.16b, #3          // c2 >>= 3
        bic             v18.16b, v18.16b, v17.16b     // w &= ~hev
        sqsub           v4.16b,  v4.16b,  v19.16b     // QS0 = clamp(QS0-c1)
        sqadd           v3.16b,  v3.16b,  v20.16b     // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi            v17.8h,  #63
        sshll           v22.8h,  v18.8b,  #3
        sshll2          v23.8h,  v18.16b, #3
        saddw           v22.8h,  v22.8h,  v18.8b
        saddw2          v23.8h,  v23.8h,  v18.16b
        add             v16.8h,  v17.8h,  v22.8h
        add             v17.8h,  v17.8h,  v23.8h      //  9*w + 63
        add             v19.8h,  v16.8h,  v22.8h
        add             v20.8h,  v17.8h,  v23.8h      // 18*w + 63
        add             v22.8h,  v19.8h,  v22.8h
        add             v23.8h,  v20.8h,  v23.8h      // 27*w + 63
        sqshrn          v16.8b,  v16.8h,  #7
        sqshrn2         v16.16b, v17.8h,  #7          // clamp(( 9*w + 63)>>7)
        sqshrn          v19.8b,  v19.8h,  #7
        sqshrn2         v19.16b, v20.8h,  #7          // clamp((18*w + 63)>>7)
        sqshrn          v22.8b,  v22.8h,  #7
        sqshrn2         v22.16b, v23.8h,  #7          // clamp((27*w + 63)>>7)
        sqadd           v1.16b,  v1.16b,  v16.16b     // PS2 = clamp(PS2+a)
        sqsub           v6.16b,  v6.16b,  v16.16b     // QS2 = clamp(QS2-a)
        sqadd           v2.16b,  v2.16b,  v19.16b     // PS1 = clamp(PS1+a)
        sqsub           v5.16b,  v5.16b,  v19.16b     // QS1 = clamp(QS1-a)
        sqadd           v3.16b,  v3.16b,  v22.16b     // PS0 = clamp(PS0+a)
        sqsub           v4.16b,  v4.16b,  v22.16b     // QS0 = clamp(QS0-a)
        eor             v3.16b,  v3.16b,  v21.16b     // P0 = PS0 ^ 0x80
        eor             v4.16b,  v4.16b,  v21.16b     // Q0 = QS0 ^ 0x80
        eor             v2.16b,  v2.16b,  v21.16b     // P1 = PS1 ^ 0x80
        eor             v5.16b,  v5.16b,  v21.16b     // Q1 = QS1 ^ 0x80
        eor             v1.16b,  v1.16b,  v21.16b     // P2 = PS2 ^ 0x80
        eor             v6.16b,  v6.16b,  v21.16b     // Q2 = QS2 ^ 0x80
    .endif
.endm
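
// On entry to the vertical filters below, x0 = dst (pointing at the edge),
// x1 = stride, w2 = flim_E, w3 = flim_I and w4 = hev_thresh; the simple
// filter only takes flim_E.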
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b}, [x0], x1 // P3
        ld1             {v1.16b}, [x0], x1 // P2
    .endif
        ld1             {v2.16b}, [x0], x1 // P1
        ld1             {v3.16b}, [x0], x1 // P0
        ld1             {v4.16b}, [x0], x1 // Q0
        ld1             {v5.16b}, [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b}, [x0], x1 // Q2
        ld1             {v7.16b}, [x0]     // Q3
        dup             v23.16b, w3        // flim_I
    .endif
        dup             v22.16b, w2        // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2:  dst -= stride * 6
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b}, [x0], x1 // P2
    .endif
        st1             {v2.16b}, [x0], x1 // P1
        st1             {v3.16b}, [x0], x1 // P0
        st1             {v4.16b}, [x0], x1 // Q0
        st1             {v5.16b}, [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b}, [x0]     // Q2
    .endif

        ret
endfunc
.endm

vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1

.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1             {v0.d}[0], [x0], x2 // P3
        ld1             {v0.d}[1], [x1], x2 // P3
        ld1             {v1.d}[0], [x0], x2 // P2
        ld1             {v1.d}[1], [x1], x2 // P2
        ld1             {v2.d}[0], [x0], x2 // P1
        ld1             {v2.d}[1], [x1], x2 // P1
        ld1             {v3.d}[0], [x0], x2 // P0
        ld1             {v3.d}[1], [x1], x2 // P0
        ld1             {v4.d}[0], [x0], x2 // Q0
        ld1             {v4.d}[1], [x1], x2 // Q0
        ld1             {v5.d}[0], [x0], x2 // Q1
        ld1             {v5.d}[1], [x1], x2 // Q1
        ld1             {v6.d}[0], [x0], x2 // Q2
        ld1             {v6.d}[1], [x1], x2 // Q2
        ld1             {v7.d}[0], [x0]     // Q3
        ld1             {v7.d}[1], [x1]     // Q3

        dup             v22.16b, w3 // flim_E
        dup             v23.16b, w4 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2: u,v -= stride * 6
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        sub             x0,  x0,  x2,  lsl #1
        sub             x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1             {v1.d}[0], [x0], x2 // P2
        st1             {v1.d}[1], [x1], x2 // P2
        st1             {v2.d}[0], [x0], x2 // P1
        st1             {v2.d}[1], [x1], x2 // P1
        st1             {v3.d}[0], [x0], x2 // P0
        st1             {v3.d}[1], [x1], x2 // P0
        st1             {v4.d}[0], [x0], x2 // Q0
        st1             {v4.d}[1], [x1], x2 // Q0
        st1             {v5.d}[0], [x0], x2 // Q1
        st1             {v5.d}[1], [x1], x2 // Q1
        st1             {v6.d}[0], [x0]     // Q2
        st1             {v6.d}[1], [x1]     // Q2

        ret
endfunc
.endm

vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
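
// For the horizontal filters the pixels of interest form columns, so 16
// rows of 8 bytes are loaded and transposed to get P3..Q3 into v0..v7,
// filtered, then transposed back before storing.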
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4
        // Load pixels:
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w2 // flim_E
    .if !\simple
        dup             v23.16b, w3 // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1,  lsl #4 // backup 16 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm

vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1

.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4
        sub             x1,  x1,  #4

        // Load pixels:
        ld1             {v0.d}[0], [x0], x2 // load u
        ld1             {v0.d}[1], [x1], x2 // load v
        ld1             {v1.d}[0], [x0], x2
        ld1             {v1.d}[1], [x1], x2
        ld1             {v2.d}[0], [x0], x2
        ld1             {v2.d}[1], [x1], x2
        ld1             {v3.d}[0], [x0], x2
        ld1             {v3.d}[1], [x1], x2
        ld1             {v4.d}[0], [x0], x2
        ld1             {v4.d}[1], [x1], x2
        ld1             {v5.d}[0], [x0], x2
        ld1             {v5.d}[1], [x1], x2
        ld1             {v6.d}[0], [x0], x2
        ld1             {v6.d}[1], [x1], x2
        ld1             {v7.d}[0], [x0], x2
        ld1             {v7.d}[1], [x1], x2

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        dup             v22.16b, w3 // flim_E
        dup             v23.16b, w4 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2,  lsl #3 // backup u 8 rows
        sub             x1,  x1,  x2,  lsl #3 // backup v 8 rows

        transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x2 // store u
        st1             {v0.d}[1], [x1], x2 // store v
        st1             {v1.d}[0], [x0], x2
        st1             {v1.d}[1], [x1], x2
        st1             {v2.d}[0], [x0], x2
        st1             {v2.d}[1], [x1], x2
        st1             {v3.d}[0], [x0], x2
        st1             {v3.d}[1], [x1], x2
        st1             {v4.d}[0], [x0], x2
        st1             {v4.d}[1], [x1], x2
        st1             {v5.d}[0], [x0], x2
        st1             {v5.d}[1], [x1], x2
        st1             {v6.d}[0], [x0], x2
        st1             {v6.d}[1], [x1], x2
        st1             {v7.d}[0], [x0]
        st1             {v7.d}[1], [x1]

        ret

endfunc
.endm

vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1


function ff_put_vp8_pixels16_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.16b}, [x2], x3
        ld1             {v1.16b}, [x2], x3
        ld1             {v2.16b}, [x2], x3
        ld1             {v3.16b}, [x2], x3
        st1             {v0.16b}, [x0], x1
        st1             {v1.16b}, [x0], x1
        st1             {v2.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        b.gt            1b
        ret
endfunc

function ff_put_vp8_pixels8_neon, export=1
1:
        subs            w4,  w4,  #4
        ld1             {v0.8b},   [x2], x3
        ld1             {v0.d}[1], [x2], x3
        ld1             {v1.8b},   [x2], x3
        ld1             {v1.d}[1], [x2], x3
        st1             {v0.8b},   [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.8b},   [x0], x1
        st1             {v1.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc

/* 4/6-tap 8th-pel MC */
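
// Each 6-tap filter computes
//   out = clip((f0*s0 - f1*s1 + f2*s2 + f3*s3 - f4*s4 + f5*s5 + 64) >> 7)
// with the tap signs folded into the mla/mls instructions; the rounding
// and final clip to 0..255 are done by sqrshrun with a shift of 7.  The
// 4-tap variants simply drop the f0/f5 terms.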
.macro  vp8_epel8_h6    d,   s0,  s1
        ext             v22.8b, \s0\().8b, \s1\().8b, #1
        uxtl            v18.8h, \s0\().8b
        ext             v23.8b, \s0\().8b, \s1\().8b, #2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b, \s1\().8b, #3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b, \s1\().8b, #4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b, \s1\().8b, #5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v21.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        mla             v21.8h, v18.8h, v0.h[0]
        mla             v22.8h, v26.8h, v0.h[5]
        sqadd           v22.8h, v21.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm

.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h,    \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h,    \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h,    \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h,    \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm

.macro  vp8_epel8_h4    d,   v0,  v1
        ext             v22.8b, \v0\().8b, \v1\().8b, #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b, \v1\().8b, #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b, \v1\().8b, #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm

.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s4\().8h, \s4\().8b
        mul             v21.8h,    \s1\().8h, v0.h[2]
        mul             v23.8h,    \s2\().8h, v0.h[3]
        mul             \s2\().8h, \s2\().8h, v0.h[2]
        mul             v22.8h,    \s3\().8h, v0.h[3]
        mls             v21.8h,    \s0\().8h, v0.h[1]
        mls             v23.8h,    \s3\().8h, v0.h[4]
        mls             \s2\().8h, \s1\().8h, v0.h[1]
        mls             v22.8h,    \s4\().8h, v0.h[4]
        sqadd           v21.8h,    v21.8h,    v23.8h
        sqadd           \s2\().8h, \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm


// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
// arithmetic can be used to apply filters
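// (with the tap signs applied, each filter row below sums to 128, e.g.
// 2 - 11 + 108 + 36 - 8 + 1 = 128, so the worst case is 128 * 255 =
// 0x7f80; the two partial sums are combined with a saturating sqadd,
// keeping everything within signed 16-bit range)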
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst

function ff_put_vp8_epel16_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17, subpel_filters, -16
        add             x6,  x17, x6,  lsl #4 // y
        ld1             {v0.8h},  [x6]
1:
        ld1             {v1.1d - v2.1d},   [x2], x3
        ld1             {v3.1d - v4.1d},   [x2], x3
        ld1             {v16.1d - v17.1d}, [x2], x3
        ld1             {v18.1d - v19.1d}, [x2], x3
        ld1             {v20.1d - v21.1d}, [x2], x3
        ld1             {v22.1d - v23.1d}, [x2], x3
        ld1             {v24.1d - v25.1d}, [x2]
        sub             x2,  x2,  x3,  lsl #2

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2
        sxtw            x5,  w5 // x

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        add             x5,  x17, x5,  lsl #4 // x
        ld1             {v0.8h},  [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x0], x1

        subs            w4,  w4,  #1
        b.ne            1b
        ret
endfunc
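
// The h?v? functions below filter in two passes: the horizontal 6-tap
// (or 4-tap) filter is run over height+5 (or height+3) input rows into a
// 16-byte-aligned scratch buffer on the stack, and the vertical filter
// then reads back from that buffer.  For the 16-pixel case the buffer is
// 16 * (16 + 5) = 336 bytes, plus 16 for alignment.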
function ff_put_vp8_epel16_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5 // x
        add             x16, x17, x5,  lsl #4 // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1,  v1,  v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b


        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b},   [x7], #32
        ld1             {v16.8b - v19.8b}, [x7], #32
        ld1             {v20.8b - v23.8b}, [x7], #32
        ld1             {v24.8b - v25.8b}, [x7]
        sub             x7,  x7,  #64

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        trn1            v1.2d, v1.2d, v2.2d
        trn1            v3.2d, v3.2d, v4.2d

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc

function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},  [x6]
1:
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2]
        sub             x2,  x2,  x3,  lsl #1

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},  [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
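
// As in the h4v4 case above, the 4-tap second pass only needs height+3
// intermediate rows, so the source pointer backs up by one row instead of
// two before the first pass.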
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17, subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17, x5,  lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #3   // h
        bic             x7,  x7,  #15
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1,  v1,  v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17, x6,  lsl #4 // y
        add             x7,  sp,  #15
        ld1             {v0.8h},  [x6]
        bic             x7,  x7,  #15
2:
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4,  x4,  #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc

function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},    [x6]
1:
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel4_h6_neon, export=1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]

        sub             sp,  sp,  #52
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
        mov             x9,  sp
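        // The vertical pass packs two 4-pixel columns into each lane pair:
        // the trn1/trn2 shuffles interleave rows n and n+2, so a single
        // vp8_epel8_v6_y2 invocation produces four output rows at once.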
2:
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},        [x9], #8
        ld1r            {v28.2s},       [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},        [x9], #8
        ld1             {v28.s}[1],     [x9]
        sub             x9,  x9,  #16
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc

function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]

        sub             sp,  sp,  #52
        add             w8,  w4,  #5
        mov             x9,  sp
1:
        ld1             {v2.8b},   [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1             {v6.8b},        [x9], #8
        ld1r            {v28.2s},       [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v7.8b},        [x9], #8
        ld1             {v28.s}[1],     [x9]
        sub             x9,  x9,  #16
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0], [x0], x1
        st1             {v3.s}[0], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v3.s}[1], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc

function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]

        sub             sp,  sp,  #44
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b, v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1r            {v6.2s},        [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v6.s}[1],      [x9]
        sub             x9,  x9,  #8
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc

function ff_put_vp8_epel4_h4_neon, export=1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]
1:
        ld1             {v2.8b},   [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel4_v4_neon, export=1
        sub             x2,  x2,  x3

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
1:
        ld1r            {v2.2s},   [x2], x3
        ld1r            {v3.2s},   [x2], x3
        ld1r            {v4.2s},   [x2], x3
        ld1r            {v5.2s},   [x2], x3
        ld1r            {v6.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        ld1             {v6.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #1

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[2], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v2.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc

function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},   [x5]

        sub             sp,  sp,  #44
        add             w8,  w4,  #3
        mov             x9,  sp
1:
        ld1             {v2.8b},   [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0], [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
        mov             x9,  sp
2:
        ld1             {v2.8b, v3.8b}, [x9], #16
        ld1r            {v6.2s},        [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b, v5.8b}, [x9], #16
        ld1             {v6.s}[1],      [x9]
        sub             x9,  x9,  #8
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc

/* Bilinear MC */
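
// The bilinear functions blend two pixels with 3-bit fractional weights:
//   out = (s0 * (8 - frac) + s1 * frac + 4) >> 3
// implemented as umull/umlal followed by a rounding narrow (rshrn #3);
// mx selects the horizontal weight and my the vertical one.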
function ff_put_vp8_bilin16_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
1:
        subs            w4,  w4,  #2
        ld1             {v2.8b, v3.8b, v4.8b}, [x2], x3
        ext             v5.8b,  v3.8b,  v4.8b,  #1
        ext             v4.8b,  v2.8b,  v3.8b,  #1
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        ld1             {v18.8b, v19.8b, v20.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v6.8h,  #3
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin16_v_neon, export=1
        mov             w7,  #8
        dup             v0.16b, w6
        sub             w6,  w7,  w6
        dup             v1.16b, w6

        ld1             {v2.16b}, [x2], x3
1:
        subs            w4,  w4,  #2
        ld1             {v4.16b}, [x2], x3
        umull           v6.8h,   v2.8b,  v1.8b
        umlal           v6.8h,   v4.8b,  v0.8b
        umull2          v16.8h,  v2.16b, v1.16b
        umlal2          v16.8h,  v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v2.8b,  v0.8b
        umull2          v20.8h,  v4.16b, v1.16b
        umlal2          v20.8h,  v2.16b, v0.16b
        rshrn           v4.8b,   v6.8h,  #3
        rshrn2          v4.16b,  v16.8h, #3
        rshrn           v6.8b,   v18.8h, #3
        rshrn2          v6.16b,  v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
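
// The hv variants pipeline the two passes: the horizontally filtered
// result of the last row is kept in a register across loop iterations so
// each input row is only filtered once.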
function ff_put_vp8_bilin16_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5 // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.16b, w6 // my
        sub             w6,  w7,  w6
        dup             v3.16b, w6

        ld1             {v4.8b, v5.8b, v6.8b}, [x2], x3

        ext             v7.8b,  v5.8b,  v6.8b,  #1
        ext             v6.8b,  v4.8b,  v5.8b,  #1
        umull           v16.8h, v4.8b,  v1.8b
        umlal           v16.8h, v6.8b,  v0.8b
        umull           v18.8h, v5.8b,  v1.8b
        umlal           v18.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v18.8h, #3
1:
        subs            w4,  w4,  #2
        ld1             {v18.8b, v19.8b, v20.8b}, [x2], x3
        ext             v21.8b,  v19.8b, v20.8b, #1
        ext             v20.8b,  v18.8b, v19.8b, #1
        umull           v22.8h,  v18.8b, v1.8b
        umlal           v22.8h,  v20.8b, v0.8b
        ld1             {v26.8b, v27.8b, v28.8b}, [x2], x3
        umull           v24.8h,  v19.8b, v1.8b
        umlal           v24.8h,  v21.8b, v0.8b
        ext             v29.8b,  v27.8b, v28.8b, #1
        ext             v28.8b,  v26.8b, v27.8b, #1
        umull           v16.8h,  v26.8b, v1.8b
        umlal           v16.8h,  v28.8b, v0.8b
        umull           v18.8h,  v27.8b, v1.8b
        umlal           v18.8h,  v29.8b, v0.8b
        rshrn           v6.8b,   v22.8h, #3
        rshrn2          v6.16b,  v24.8h, #3
        umull           v24.8h,  v4.8b,  v3.8b
        umlal           v24.8h,  v6.8b,  v2.8b
        umull2          v30.8h,  v4.16b, v3.16b
        umlal2          v30.8h,  v6.16b, v2.16b
        rshrn           v4.8b,   v16.8h, #3
        rshrn2          v4.16b,  v18.8h, #3
        umull           v20.8h,  v6.8b,  v3.8b
        umlal           v20.8h,  v4.8b,  v2.8b
        umull2          v22.8h,  v6.16b, v3.16b
        umlal2          v22.8h,  v4.16b, v2.16b
        rshrn           v24.8b,  v24.8h, #3
        rshrn2          v24.16b, v30.8h, #3
        st1             {v24.16b}, [x0], x1
        rshrn           v20.8b,  v20.8h, #3
        rshrn2          v20.16b, v22.8h, #3
        st1             {v20.16b}, [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin8_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
1:
        subs            w4,  w4,  #2
        ld1             {v2.8b, v3.8b}, [x2], x3
        ext             v3.8b,  v2.8b,  v3.8b,  #1
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        ld1             {v6.8b, v7.8b}, [x2], x3
        ext             v7.8b,  v6.8b,  v7.8b,  #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v16.8b, v16.8h, #3
        st1             {v4.8b},  [x0], x1
        st1             {v16.8b}, [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin8_v_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w6
        sub             w6,  w7,  w6
        dup             v1.8b,  w6

        ld1             {v2.8b}, [x2], x3
1:
        subs            w4,  w4,  #2
        ld1             {v3.8b}, [x2], x3
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        ld1             {v2.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v2.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        rshrn           v6.8b,  v6.8h,  #3
        st1             {v4.8b}, [x0], x1
        st1             {v6.8b}, [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin8_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5 // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.8b,  w6 // my
        sub             w6,  w7,  w6
        dup             v3.8b,  w6

        ld1             {v4.8b, v5.8b}, [x2], x3
        ext             v5.8b,  v4.8b,  v5.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3
1:
        subs            w4,  w4,  #2
        ld1             {v6.8b, v7.8b}, [x2], x3
        ext             v7.8b,  v6.8b,  v7.8b,  #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        ld1             {v4.8b, v5.8b}, [x2], x3
        ext             v5.8b,  v4.8b,  v5.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3
        umull           v20.8h, v22.8b, v3.8b
        umlal           v20.8h, v16.8b, v2.8b
        rshrn           v22.8b, v18.8h, #3
        umull           v24.8h, v16.8b, v3.8b
        umlal           v24.8h, v22.8b, v2.8b
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.8b}, [x0], x1
        rshrn           v23.8b, v24.8h, #3
        st1             {v23.8b}, [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin4_h_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
1:
        subs            w4,  w4,  #2
        ld1             {v2.8b}, [x2], x3
        ext             v3.8b,  v2.8b,  v3.8b,  #1
        ld1             {v6.8b}, [x2], x3
        ext             v7.8b,  v6.8b,  v7.8b,  #1
        trn1            v2.2s,  v2.2s,  v6.2s
        trn1            v3.2s,  v3.2s,  v7.2s
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin4_v_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w6
        sub             w6,  w7,  w6
        dup             v1.8b,  w6

        ld1r            {v2.2s},   [x2], x3
1:
        ld1r            {v3.2s},   [x2]
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        umull           v4.8h,  v2.8b,  v1.8b
        umlal           v4.8h,  v3.8b,  v0.8b
        trn2            v2.2s,  v3.2s,  v2.2s
        rshrn           v4.8b,  v4.8h,  #3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        subs            w4,  w4,  #2
        b.gt            1b

        ret
endfunc

function ff_put_vp8_bilin4_hv_neon, export=1
        mov             w7,  #8
        dup             v0.8b,  w5 // mx
        sub             w5,  w7,  w5
        dup             v1.8b,  w5
        dup             v2.8b,  w6 // my
        sub             w6,  w7,  w6
        dup             v3.8b,  w6

        ld1             {v4.8b}, [x2], x3
        ext             v5.8b,  v4.8b,  v4.8b,  #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3
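        // v22 carries the horizontally filtered previous row; trn1 pairs
        // it with the current row for the vertical blend, and rev64 swaps
        // the two new rows so the newest one sits in lane 0 for the next
        // iteration.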
1:
        subs            w4,  w4,  #2
        ld1             {v6.8b}, [x2], x3
        ext             v7.8b,  v6.8b,  v6.8b,  #1
        ld1             {v4.8b}, [x2], x3
        ext             v5.8b,  v4.8b,  v4.8b,  #1
        trn1            v6.2s,  v6.2s,  v4.2s
        trn1            v7.2s,  v7.2s,  v5.2s
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3
        umull           v20.8h, v16.8b, v2.8b
        trn1            v22.2s, v22.2s, v16.2s
        umlal           v20.8h, v22.8b, v3.8b
        rev64           v22.2s, v16.2s
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc