/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h, \r0\().8h, \r1\().8h
        trn2            \t5\().8h, \r0\().8h, \r1\().8h
        trn1            \t6\().8h, \r2\().8h, \r3\().8h
        trn2            \t7\().8h, \r2\().8h, \r3\().8h

        trn1            \r0\().4s, \t4\().4s, \t6\().4s
        trn2            \r2\().4s, \t4\().4s, \t6\().4s
        trn1            \r1\().4s, \t5\().4s, \t7\().4s
        trn2            \r3\().4s, \t5\().4s, \t7\().4s
.endm

// The input to and output from this macro are in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h, w2 // E
        dup             v2.8h, w3 // I
        dup             v3.8h, w4 // H

        uabd            v4.8h, v20.8h, v21.8h // abs(p3 - p2)
        uabd            v5.8h, v21.8h, v22.8h // abs(p2 - p1)
        uabd            v6.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd            v7.8h, v24.8h, v25.8h // abs(q0 - q1)
        uabd            \tmp1\().8h, v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h, v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h, v4.8h, v5.8h
        umax            v5.8h, v6.8h, v7.8h
        umax            \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h, v23.8h, v24.8h // abs(p0 - q0)
        umax            v4.8h, v4.8h, v5.8h
        add             v6.8h, v6.8h, v6.8h // abs(p0 - q0) * 2
        uabd            v5.8h, v22.8h, v25.8h // abs(p1 - q1)
        umax            v4.8h, v4.8h, \tmp1\().8h // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h, v5.8h, #1
        cmhs            v4.8h, v2.8h, v4.8h // max(abs()) <= I
        add             v6.8h, v6.8h, v5.8h // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v6.8h, v0.8h, v6.8h
        and             v4.16b, v4.16b, v6.16b // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        br              x10
1:

.if \wd >= 8
        dup             v0.8h, w5

        uabd            v6.8h, v20.8h, v23.8h // abs(p3 - p0)
        uabd            v2.8h, v21.8h, v23.8h // abs(p2 - p0)
        uabd            v1.8h, v22.8h, v23.8h // abs(p1 - p0)
        uabd            \tmp1\().8h, v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h, v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h, v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h, v6.8h, v2.8h
        umax            v1.8h, v1.8h, \tmp1\().8h
        umax            \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
        uabd            v7.8h, v16.8h, v23.8h // abs(p7 - p0)
        umax            v6.8h, v6.8h, v1.8h
        uabd            v2.8h, v17.8h, v23.8h // abs(p6 - p0)
        umax            v6.8h, v6.8h, \tmp2\().8h
        uabd            v1.8h, v18.8h, v23.8h // abs(p5 - p0)
        cmhs            v6.8h, v0.8h, v6.8h // flat8in
        uabd            v8.8h, v19.8h, v23.8h // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b // flat8in && fm
        uabd            v9.8h, v28.8h, v24.8h // abs(q4 - q0)
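        // Together with the abs(p7..p4 - p0) terms above, these q-side
        // differences feed the flat8out check further below: flat8out is set
        // only where all of abs(p7..p4 - p0) and abs(q4..q7 - q0) are <= the
        // flat threshold in w5 (the same threshold that gates flat8in).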
        bic             v4.16b, v4.16b, v6.16b // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h // abs(q7 - q0)

        umax            v7.8h, v7.8h, v2.8h
        umax            v1.8h, v1.8h, v8.8h
        umax            v9.8h, v9.8h, v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h, v22.8h, v23.8h // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h, v7.8h, v1.8h
        umax            v9.8h, v9.8h, v11.8h
.elseif \wd == 8
        umax            v6.8h, v6.8h, v1.8h
.endif
        uabd            v1.8h, v25.8h, v24.8h // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h, v7.8h, v9.8h
.elseif \wd == 8
        umax            v6.8h, v6.8h, \tmp2\().8h
.endif
        dup             \tmp2\().8h, w6 // left shift for saturation
        sub             \tmp1\().8h, v22.8h, v25.8h // p1 - q1
        neg             \tmp6\().8h, \tmp2\().8h // negative left shift after saturation
        umax            v5.8h, v5.8h, v1.8h // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h, v24.8h, v23.8h // q0 - p0
        movi            \tmp5\().8h, #3
.if \wd == 8
        cmhs            v6.8h, v0.8h, v6.8h // flat8in
.endif
        cmhs            v5.8h, v3.8h, v5.8h // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b // flat8in && fm
.endif
        sqshl           \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h, v0.8h, v7.8h // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b // fm && !flat8in
.endif
        and             v5.16b, v5.16b, v4.16b // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h, \tmp3\().8h, \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b // if (!hev) av_clip_int8 = 0
        movi            v2.8h, #4
        add             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3.8h, #3
        sqshl           \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        movi            \tmp5\().8h, #0
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h, w7 // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h, \tmp6\().8h, #1 // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h, \tmp1\().8h, v2.8h // f + 4
        add             \tmp4\().8h, \tmp1\().8h, v3.8h // f + 3
        smin            \tmp3\().8h, \tmp3\().8h, \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h, \tmp4\().8h, \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h, \tmp3\().8h, #3 // f1
        sshr            \tmp4\().8h, \tmp4\().8h, #3 // f2

        add             v0.8h, v23.8h, \tmp4\().8h // p0 + f2
        sub             v2.8h, v24.8h, \tmp3\().8h // q0 - f1
        smin            v0.8h, v0.8h, \tmp6\().8h
        smin            v2.8h, v2.8h, \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1 // f = (f1 + 1) >> 1
        smax            v0.8h, v0.8h, \tmp5\().8h // out p0
        smax            v2.8h, v2.8h, \tmp5\().8h // out q0
        bit             v23.16b, v0.16b, v4.16b // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h, v22.8h, \tmp3\().8h // p1 + f
        sub             v2.8h, v25.8h, \tmp3\().8h // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h, v0.8h, \tmp6\().8h
        smin            v2.8h, v2.8h, \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h, v0.8h, \tmp5\().8h // out p1
        smax            v2.8h, v2.8h, \tmp5\().8h // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b, v5.16b // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        br              x13
1:
.endif

        // flat8in
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h, \tmp1\().8h, \tmp1\().8h
        add             v0.8h, v0.8h, v23.8h
        add             v0.8h, v0.8h, v24.8h
        add             v0.8h, v0.8h, \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h, v0.8h, #3 // out p2

        add             v0.8h, v0.8h, \tmp3\().8h
        add             \tmp1\().8h, v20.8h, v23.8h
        add             \tmp3\().8h, v24.8h, v27.8h
        urshr           v3.8h, v0.8h, #3 // out p1

        add             v0.8h, v0.8h, \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h, v24.8h
        add             \tmp7\().8h, v25.8h, v27.8h
        urshr           v4.8h, v0.8h, #3 // out p0

        add             v0.8h, v0.8h, \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h, v25.8h
        add             \tmp3\().8h, v26.8h, v27.8h
        urshr           v5.8h, v0.8h, #3 // out q0

        add             v0.8h, v0.8h, \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h, #3 // out q1

        add             v0.8h, v0.8h, \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h, v0.8h, #3 // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b, v6.16b, v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in nor flat8out, jump to a
        // writeout of the inner 4 pixels
        br              x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br              x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
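        // The wd=16 filter below is evaluated as a running sum kept in v0: it
        // starts out as 7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0, and
        // for each further output the 16-tap window slides down by one pixel
        // (the paired add/sub corrections interleaved below), with every output
        // rounded as (sum + 8) >> 4 by the urshr #4 instructions.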
        shl             v0.8h, v16.8h, #3 // 8 * v16
        sub             v0.8h, v0.8h, v16.8h // 7 * v16
        add             v0.8h, v0.8h, v17.8h
        add             v8.8h, v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h, v0.8h, v8.8h
        add             v8.8h, v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h, v0.8h, v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h, v0.8h, v12.8h
        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v8.8h, v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v8.8h, v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h, #4

        add             v0.8h, v0.8h, v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h, #4

        add             v0.8h, v0.8h, v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h, #4

        add             v0.8h, v0.8h, v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h, #4

        add             v0.8h, v0.8h, v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h, #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
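// Besides returning normally with ret, the loop_filter macro expanded in these
// functions can leave through several branch targets passed in registers:
// x10 holds the return address of the caller of the per-block functions below
// (used for the early exit when no pixels need filtering), while x13 (wd=8)
// and x14/x15 (wd=16) are alternative 'return' targets for the reduced
// writeback paths, set up with adr in the loop_filter_8/16 macros below.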
function vp9_loop_filter_4
        loop_filter     4, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8, v9, v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
.endif
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
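        // The wd=16 filter uses v8-v15 as scratch (see the loop_filter macro),
        // so the callee-saved d8-d15 halves are spilled around the call when
        // \push is set.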
.endif
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0, x0, x1, lsl #3
.else
        add             x0, x0, #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8, w2, #8
        lsr             w14, w3, #8
        lsr             w15, w4, #8
        and             w2, w2, #0xff
        and             w3, w3, #0xff
        and             w4, w4, #0xff
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0, x0, x1, lsl #3
.else
        add             x0, x0, #16
.endif
        lsl             w2, w8, #\bpp - 8
        lsl             w3, w14, #\bpp - 8
        lsl             w4, w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br              x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x0, x0, x1, lsl #2
        sub             x9, x9, x1, lsl #1

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1

        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9, x9, #4
        add             x0, x9, x1, lsl #2

        // We will only write the mid 4 pixels back; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
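        // After this transpose, each of v22-v25 holds two 4-pixel rows of
        // output, one in each 64-bit half; the .d[0] lanes are stored to the
        // first four rows via x9 and the .d[1] lanes to the following four
        // rows via x0 below.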
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4

        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #2
        add             x9, x9, x1

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1

        br              x10
6:
        sub             x9, x0, x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0, x9, x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        br              x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
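        // Only the middle 4 pixels (p1, p0, q0, q1 in v22-v25) were changed,
        // so step x9 forward by 2 pixels (4 bytes at 16 bpp) and store the
        // transposed 4x8 block, just as in loop_filter_h_4_8 above.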
        add             x9, x9, #4
        add             x0, x9, x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        sub             x9, x9, x1, lsl #3
        sub             x0, x0, x1, lsl #3
        add             x9, x9, x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h}, [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h}, [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h}, [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h}, [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h}, [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h}, [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, x1

        br              x10
8:
        add             x9, x9, x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1
        br              x10
7:
        sub             x9, x0, x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9, x0, #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        sub             x9, x9, x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H  v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h}, [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h}, [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h}, [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h}, [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h}, [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h}, [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h}, [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3

        br              x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8
        br              x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9, x0, #4
        add             x0, x9, x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1