/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp            \r1,  \r8   @ vtrn.64 \rq0, \rq4
        vswp            \r3,  \r10  @ vtrn.64 \rq1, \rq5
        vswp            \r5,  \r12  @ vtrn.64 \rq2, \rq6
        vswp            \r7,  \r14  @ vtrn.64 \rq3, \rq7
        vtrn.32         \rq0, \rq2
        vtrn.32         \rq1, \rq3
        vtrn.32         \rq4, \rq6
        vtrn.32         \rq5, \rq7
        vtrn.16         \rq0, \rq1
        vtrn.16         \rq2, \rq3
        vtrn.16         \rq4, \rq5
        vtrn.16         \rq6, \rq7
.endm

.macro transpose16_4x4 r0, r1, r2, r3
        vtrn.32         \r0,  \r2
        vtrn.32         \r1,  \r3
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

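@ A rough C-level sketch of the per-pixel decisions made by the loop filter
@ macros below (illustrative only; E, I and H are the pre-scaled thresholds
@ passed in r2-r4, F is the flat threshold passed in r5):
@   fm      = max(|p3-p2|, |p2-p1|, |p1-p0|, |q0-q1|, |q1-q2|, |q2-q3|) <= I &&
@             |p0-q0| * 2 + (|p1-q1| >> 1) <= E
@   hev     = max(|p1-p0|, |q1-q0|) > H
@   flat8in = max(|p3-p0|, |p2-p0|, |p1-p0|, |q1-q0|, |q2-q0|, |q3-q0|) <= F
@ Only pixels with fm set are filtered at all; flat8in (and, for wd == 16,
@ flat8out) select the wider smoothing filters instead of the normal one.
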
@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
.macro loop_filter_q wd
        vdup.u16        q0,  r2          @ E
        vdup.u16        q1,  r3          @ I

        vabd.u16        q2,  q8,  q9     @ abs(p3 - p2)
        vabd.u16        q3,  q9,  q10    @ abs(p2 - p1)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q12, q13    @ abs(q0 - q1)
        vabd.u16        q6,  q13, q14    @ abs(q1 - q2)
        vabd.u16        q7,  q14, q15    @ abs(q2 - q3)
        vmax.u16        q2,  q2,  q3
        vmax.u16        q3,  q4,  q5
        vmax.u16        q4,  q6,  q7
        vabd.u16        q5,  q11, q12    @ abs(p0 - q0)
        vmax.u16        q2,  q2,  q3
        vadd.u16        q5,  q5,  q5     @ abs(p0 - q0) * 2
        vabd.u16        q6,  q10, q13    @ abs(p1 - q1)
        vmax.u16        q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        q6,  q6,  #1
        vcle.u16        q2,  q2,  q1     @ max(abs()) <= I
        vadd.u16        q5,  q5,  q6     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        q5,  q5,  q0
        vand            q2,  q2,  q5     @ fm

        vmovn.u16       d10, q2
        vmov            r8,  r9,  d10
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        q0,  r5

        vabd.u16        q1,  q8,  q11    @ abs(p3 - p0)
        vabd.u16        q3,  q9,  q11    @ abs(p2 - p0)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q13, q12    @ abs(q1 - q0)
        vabd.u16        q6,  q14, q12    @ abs(q2 - q0)
        vabd.u16        q7,  q15, q12    @ abs(q3 - q0)
        vmax.u16        q1,  q1,  q3
        vmax.u16        q4,  q4,  q5
        vmax.u16        q6,  q6,  q7
        @ The rest of the calculation of flat8in is interleaved below
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        q3,  q10, q11    @ abs(p1 - p0)
.if \wd == 8
        vmax.u16        q1,  q1,  q4
.endif
        vabd.u16        q4,  q13, q12    @ abs(q1 - q0)
.if \wd == 8
        vmax.u16        q1,  q1,  q6
.endif

        vsub.u16        q5,  q10, q13    @ p1 - q1
        vmax.u16        q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
        vdup.u16        q4,  r4          @ H
        vsub.u16        q6,  q12, q11    @ q0 - p0
.if \wd == 8
        vcle.u16        q1,  q1,  q0     @ flat8in
.endif
        vdup.u16        q0,  r6          @ left shift for saturation
        vcle.u16        q3,  q3,  q4     @ !hev
.if \wd == 8
        vand            q1,  q1,  q2     @ flat8in && fm
.endif
        vneg.s16        q4,  q0          @ negative left shift after saturation
        vqshl.s16       q5,  q5,  q0
.if \wd == 8
        vbic            q2,  q2,  q1     @ fm && !flat8in
.endif
        vmov.s16        q7,  #3
        vand            q3,  q3,  q2     @ !hev && fm && !flat8in
        vshl.s16        q5,  q5,  q4     @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        q6,  q6,  q7     @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int2p = 0
        vadd.s16        q6,  q6,  q5     @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        q5,  #4
        vqshl.s16       q6,  q6,  q0
        vmov.s16        q0,  #3
        vshl.s16        q6,  q6,  q4     @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        q4,  r7          @ max pixel value

        vshr.u16        q4,  q4,  #1     @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        q5,  q6,  q5     @ f + 4
        vadd.s16        q0,  q6,  q0     @ f + 3
        vmov.s16        q6,  #0
        vmin.s16        q5,  q5,  q4     @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        q0,  q0,  q4     @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vdup.u16        q4,  r7          @ max pixel value
        vshr.s16        q5,  q5,  #3     @ f1
        vshr.s16        q0,  q0,  #3     @ f2

        vadd.s16        q0,  q11, q0     @ p0 + f2
        vsub.s16        q7,  q12, q5     @ q0 - f1
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
        vrshr.s16       q5,  q5,  #1     @ f = (f1 + 1) >> 1
        vmax.s16        q0,  q0,  q6     @ out p0
        vmax.s16        q7,  q7,  q6     @ out q0
        vbit            q11, q0,  q2     @ if (fm && !flat8in)
        vbit            q12, q7,  q2
.if \wd >= 8
        vmovn.u16       d4,  q1
.endif

        vadd.s16        q0,  q10, q5     @ p1 + f
        vsub.s16        q7,  q13, q5     @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d4
.endif
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        q0,  q0,  q6     @ out p1
        vmax.s16        q7,  q7,  q6     @ out q1
        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
        vbit            q13, q7,  q3

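        @ Where flat8in && fm holds, the 7-tap smoothing filter below replaces
        @ the normal filter output. In rough C terms (sketch, rounding done by
        @ the rounding shift vrshr #3):
        @   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        @   p1' = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
        @   p0' = (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
        @ and symmetrically for q0', q1', q2'. The running sum kept in the q0
        @ register implements this incrementally, adding/subtracting one pair
        @ of pixels per output.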
.if \wd >= 8
        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
        beq             6f

        @ flat8in
        vadd.u16        q2,  q8,  q9
        vadd.u16        q3,  q10, q13
        vadd.u16        q4,  q8,  q10
        vadd.u16        q5,  q11, q14
        vadd.u16        q0,  q2,  q2
        vadd.u16        q0,  q0,  q11
        vadd.u16        q0,  q0,  q12
        vadd.u16        q0,  q0,  q4
        vsub.s16        q3,  q3,  q2
        vsub.s16        q5,  q5,  q4
        vrshr.u16       q6,  q0,  #3     @ out p2

        vadd.u16        q0,  q0,  q3
        vadd.u16        q2,  q8,  q11
        vadd.u16        q3,  q12, q15
        vrshr.u16       q7,  q0,  #3     @ out p1

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vadd.u16        q4,  q9,  q12
        vbit            q9,  q6,  q1
        vadd.u16        q5,  q13, q15
        vrshr.u16       q6,  q0,  #3     @ out p0

        vadd.u16        q0,  q0,  q3
        vsub.s16        q5,  q5,  q4
        vadd.u16        q2,  q10, q13
        vbit            q10, q7,  q1
        vadd.u16        q3,  q14, q15
        vrshr.u16       q7,  q0,  #3     @ out q0

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vbit            q11, q6,  q1
        vrshr.u16       q6,  q0,  #3     @ out q1

        vadd.u16        q0,  q0,  q3
        vbit            q12, q7,  q1
        vrshr.u16       q7,  q0,  #3     @ out q2
        vbit            q13, q6,  q1
        vbit            q14, q7,  q1
.endif
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ In practice, this is only ever instantiated once, so the macro parameters
@ could be hardcoded, but they are kept as is, for similarity to the
@ 8 bpp and aarch64 versions.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        vdup.u16        d0,  r2          @ E
        vdup.u16        d2,  r3          @ I

        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
        vabd.u16        \tmp1, d25, d26  @ abs(q1 - q2)
        vabd.u16        \tmp2, d26, d27  @ abs(q2 - q3)
        vmax.u16        d4,  d4,  d5
        vmax.u16        d5,  d6,  d7
        vmax.u16        \tmp1, \tmp1, \tmp2
        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
        vmax.u16        d4,  d4,  d5
        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        d5,  d5,  #1
        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        d6,  d6,  d0
        vand            d4,  d4,  d6     @ fm

        vdup.u16        d3,  r4          @ H
        vmov            r8,  r9,  d4
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        d0,  r5

        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
        vabd.u16        \tmp1, d25, d24  @ abs(q1 - q0)
        vabd.u16        \tmp2, d26, d24  @ abs(q2 - q0)
        vabd.u16        \tmp3, d27, d24  @ abs(q3 - q0)
        vmax.u16        d6,  d6,  d2
        vmax.u16        d1,  d1,  \tmp1
        vmax.u16        \tmp2, \tmp2, \tmp3
.if \wd == 16
        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
        vmax.u16        d6,  d6,  d1
        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
        vmax.u16        d6,  d6,  \tmp2
        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
        vcle.u16        d6,  d6,  d0     @ flat8in
        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
        vabd.u16        d12, d31, d24    @ abs(q7 - q0)

        vmax.u16        d7,  d7,  d2
        vmax.u16        d1,  d1,  d8
        vmax.u16        d9,  d9,  d10
        vmax.u16        d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

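        @ The "normal" filter computed below is, roughly, the following C
        @ (sketch; every intermediate is clipped to the signed (BIT_DEPTH - 1)
        @ bit range via the vqshl/vshl pairs, and the outputs to
        @ [0, max pixel value]):
        @   f  = 3 * (q0 - p0) + (hev ? av_clip_int2p(p1 - q1, BIT_DEPTH - 1) : 0)
        @   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        @   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        @   p0 += f2;  q0 -= f1;
        @   if (!hev) { f = (f1 + 1) >> 1;  p1 += f;  q1 -= f; }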
        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        d5,  d22, d23    @ abs(p1 - p0)
.if \wd == 16
        vmax.u16        d7,  d7,  d1
        vmax.u16        d9,  d9,  d11
.elseif \wd == 8
        vmax.u16        d6,  d6,  d1
.endif
        vabd.u16        d1,  d25, d24    @ abs(q1 - q0)
.if \wd == 16
        vmax.u16        d7,  d7,  d9
.elseif \wd == 8
        vmax.u16        d6,  d6,  \tmp2
.endif
        vdup.u16        \tmp2, r6        @ left shift for saturation
        vsub.u16        \tmp1, d22, d25  @ p1 - q1
        vneg.s16        \tmp6, \tmp2     @ negative left shift after saturation
        vmax.u16        d5,  d5,  d1     @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        \tmp3, d24, d23  @ q0 - p0
        vmov.s16        \tmp5, #3
.if \wd == 8
        vcle.u16        d6,  d6,  d0     @ flat8in
.endif
        vcle.u16        d5,  d5,  d3     @ !hev
.if \wd == 8
        vand            d6,  d6,  d4     @ flat8in && fm
.endif
        vqshl.s16       \tmp1, \tmp1, \tmp2
.if \wd == 16
        vcle.u16        d7,  d7,  d0     @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6     @ fm && !flat8in
.endif
        vand            d5,  d5,  d4     @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6     @ flat8out && flat8in && fm
.endif
        vshl.s16        \tmp1, \tmp1, \tmp6 @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        \tmp3, \tmp3, \tmp5 @ 3 * (q0 - p0)
        vbic            \tmp1, \tmp1, d5 @ if (!hev) av_clip_int2p = 0
        vmov.s16        d2,  #4
        vadd.s16        \tmp3, \tmp3, \tmp1 @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        d3,  #3
        vqshl.s16       \tmp1, \tmp3, \tmp2
        vmov.s16        \tmp5, #0
        vshl.s16        \tmp1, \tmp1, \tmp6 @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        \tmp6, r7        @ max pixel value
.if \wd == 16
        vbic            d6,  d6,  d7     @ fm && flat8in && !flat8out
.endif

        vshr.u16        \tmp2, \tmp6, #1 @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        \tmp3, \tmp1, d2 @ f + 4
        vadd.s16        \tmp4, \tmp1, d3 @ f + 3
        vmin.s16        \tmp3, \tmp3, \tmp2 @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        \tmp4, \tmp4, \tmp2 @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vshr.s16        \tmp3, \tmp3, #3 @ f1
        vshr.s16        \tmp4, \tmp4, #3 @ f2

        vadd.s16        d0,  d23, \tmp4  @ p0 + f2
        vsub.s16        d2,  d24, \tmp3  @ q0 - f1
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vrshr.s16       \tmp3, \tmp3, #1 @ f = (f1 + 1) >> 1
        vmax.s16        d0,  d0,  \tmp5  @ out p0
        vmax.s16        d2,  d2,  \tmp5  @ out q0
        vbit            d23, d0,  d4     @ if (fm && !flat8in)
        vbit            d24, d2,  d4

        vadd.s16        d0,  d22, \tmp3  @ p1 + f
        vsub.s16        d2,  d25, \tmp3  @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d6
.endif
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        d0,  d0,  \tmp5  @ out p1
        vmax.s16        d2,  d2,  \tmp5  @ out q1
        vbit            d22, d0,  d5     @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        vadd.u16        \tmp1, d20, d21
        vadd.u16        \tmp3, d22, d25
        vadd.u16        \tmp5, d20, d22
        vadd.u16        \tmp7, d23, d26
        vadd.u16        d0,  \tmp1, \tmp1
        vadd.u16        d0,  d0,  d23
        vadd.u16        d0,  d0,  d24
        vadd.u16        d0,  d0,  \tmp5
        vsub.s16        \tmp3, \tmp3, \tmp1
        vsub.s16        \tmp7, \tmp7, \tmp5
        vrshr.u16       d2,  d0,  #3     @ out p2

        vadd.u16        d0,  d0,  \tmp3
        vadd.u16        \tmp1, d20, d23
        vadd.u16        \tmp3, d24, d27
        vrshr.u16       d3,  d0,  #3     @ out p1

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3, \tmp3, \tmp1
        vadd.u16        \tmp5, d21, d24
        vadd.u16        \tmp7, d25, d27
        vrshr.u16       d4,  d0,  #3     @ out p0

        vadd.u16        d0,  d0,  \tmp3
        vsub.s16        \tmp7, \tmp7, \tmp5
        vadd.u16        \tmp1, d22, d25
        vadd.u16        \tmp3, d26, d27
        vrshr.u16       d5,  d0,  #3     @ out q0

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3, \tmp3, \tmp1
        vrshr.u16       \tmp5, d0,  #3   @ out q1

        vadd.u16        d0,  d0,  \tmp3
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshr.u16       \tmp6, d0,  #3   @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5, d6
        vbit            d26, \tmp6, d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r8,  r9,  d2
        orrs            r8,  r8,  r9
        @ If no pixels needed flat8in nor flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r8,  r9,  d7
        orrs            r8,  r8,  r9
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

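        @ Where flat8out && flat8in && fm holds, the widest smoothing filter
        @ below is used. Roughly (sketch), each output is a rounded average
        @ over a 15-pixel window, e.g. for the first two outputs:
        @   p6' = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        @   p5' = (6*p7 + p6 + 2*p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4
        @ The accumulator in d0 is updated incrementally: for each successive
        @ output, one copy of the oldest pixel is dropped and the next pixel
        @ is added.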
        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the
        @ input to this section).
        vshl.u16        d0,  d16, #3     @ 8 * d16
        vsub.u16        d0,  d0,  d16    @ 7 * d16
        vadd.u16        d0,  d0,  d17
        vadd.u16        d8,  d17, d18
        vadd.u16        d10, d19, d20
        vadd.s16        d0,  d0,  d8
        vadd.u16        d8,  d16, d17
        vadd.u16        d12, d21, d22
        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d18, d25
        vadd.u16        d14, d23, d24
        vsub.s16        d10, d10, d8
        vadd.s16        d0,  d0,  d12
        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d18
        vadd.u16        d14, d19, d26
        vrshr.u16       d2,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d19
        vadd.u16        d10, d20, d27
        vsub.s16        d14, d14, d12
        vbif            d2,  d17, d7
        vrshr.u16       d3,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d20
        vadd.u16        d14, d21, d28
        vsub.s16        d10, d10, d8
        vbif            d3,  d18, d7
        vrshr.u16       d4,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d21
        vadd.u16        d10, d22, d29
        vsub.s16        d14, d14, d12
        vbif            d4,  d19, d7
        vrshr.u16       d5,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d22
        vadd.u16        d14, d23, d30
        vsub.s16        d10, d10, d8
        vbif            d5,  d20, d7
        vrshr.u16       d6,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d16, d23
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d24, d31
        vbif            d6,  d21, d7
        vrshr.u16       d8,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vsub.s16        d10, d12, d10
        vadd.u16        d12, d17, d24
        vadd.u16        d14, d25, d31
        vbif            d8,  d22, d7
        vrshr.u16       d9,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d26, d31
        vbif            d9,  d23, d7
        vrshr.u16       d10, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d18, d25
        vadd.u16        d18, d19, d26
        vsub.s16        d12, d12, d14
        vadd.u16        d14, d27, d31
        vbif            d10, d24, d7
        vrshr.u16       d11, d0,  #4

        vadd.s16        d0,  d0,  d12
        vadd.u16        d12, d20, d27
        vsub.s16        d14, d14, d18
        vadd.u16        d18, d28, d31
        vbif            d11, d25, d7
        vsub.s16        d18, d18, d12
        vrshr.u16       d12, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d21, d28
        vadd.u16        d20, d29, d31
        vbif            d12, d26, d7
        vrshr.u16       d13, d0,  #4

        vadd.s16        d0,  d0,  d18
        vsub.s16        d20, d20, d14
        vadd.u16        d18, d22, d29
        vadd.u16        d22, d30, d31
        vbif            d13, d27, d7
        vrshr.u16       d14, d0,  #4

        vadd.s16        d0,  d0,  d20
        vsub.s16        d22, d22, d18
        vbif            d14, d28, d7
        vrshr.u16       d15, d0,  #4

        vadd.s16        d0,  d0,  d22
        vbif            d15, d29, d7
        vrshr.u16       d17, d0,  #4
        vbif            d17, d30, d7
.endif
.endm

.macro loop_filter_q_4
        loop_filter_q   4
.endm

.macro loop_filter_q_8
        loop_filter_q   8
.endm

.macro loop_filter_16
        loop_filter     16,  d8,  d9,  d10, d11, d12, d13, d14, d15
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
.else
        add             r0,  r0,  #8
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \rep >= 4
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
.else
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
.endif
.endif
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        push            {r2, r3, r4}
        and             r2,  r2,  #0xff
        and             r3,  r3,  #0xff
        and             r4,  r4,  #0xff
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #3
.else
        add             r0,  r0,  #16
.endif
        pop             {r2, r3, r4}
        lsr             r2,  r2,  #8
        lsr             r3,  r3,  #8
        lsr             r4,  r4,  #8
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

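@ The core functions below expect the scalar arguments set up by the frontend
@ wrappers above, roughly (sketch, for bpp = 10 or 12):
@   r2 = E << (bpp - 8), r3 = I << (bpp - 8), r4 = H << (bpp - 8)
@   r5 = flat threshold, 1 << (bpp - 8)
@   r6 = 16 - bpp (the left shift used for saturating to BIT_DEPTH - 1 bits)
@   r7 = (1 << bpp) - 1 (the max pixel value)
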
function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q_4

        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
9:
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_4_8


function vp9_loop_filter_h_4_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #4
        add             r0,  r0,  #4

        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_4

        @ We will only write back the mid 4 pixels; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ We need to transpose them to columns, done with a 4x4 transpose
        @ (which in practice is two 4x4 transposes of the two 4x4 halves of
        @ the 8x4 pixels, giving 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_4_8


function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_q_8

        vst1.16         {q9},  [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q14}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_8_8


function vp9_loop_filter_h_8_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
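        @ (The outermost pixels, p3 and q3, are left untouched by the wd == 8
        @ filter, so storing them back unchanged is harmless and lets us reuse
        @ the full 8x8 transpose.)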
        transpose16_q_8x8 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        vst1.16         {q8},  [r12,:64], r1
        vst1.16         {q12}, [r0, :64], r1
        vst1.16         {q9},  [r12,:64], r1
        vst1.16         {q13}, [r0, :64], r1
        vst1.16         {q10}, [r12,:64], r1
        vst1.16         {q14}, [r0, :64], r1
        vst1.16         {q11}, [r12,:64], r1
        vst1.16         {q15}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #8
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #4
        add             r0,  r0,  #4
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12,:64], r1 @ p7
        vld1.16         {d24}, [r0, :64], r1 @ q0
        vld1.16         {d17}, [r12,:64], r1 @ p6
        vld1.16         {d25}, [r0, :64], r1 @ q1
        vld1.16         {d18}, [r12,:64], r1 @ p5
        vld1.16         {d26}, [r0, :64], r1 @ q2
        vld1.16         {d19}, [r12,:64], r1 @ p4
        vld1.16         {d27}, [r0, :64], r1 @ q3
        vld1.16         {d20}, [r12,:64], r1 @ p3
        vld1.16         {d28}, [r0, :64], r1 @ q4
        vld1.16         {d21}, [r12,:64], r1 @ p2
        vld1.16         {d29}, [r0, :64], r1 @ q5
        vld1.16         {d22}, [r12,:64], r1 @ p1
        vld1.16         {d30}, [r0, :64], r1 @ q6
        vld1.16         {d23}, [r12,:64], r1 @ p0
        vld1.16         {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d10}, [r0, :64], r1
        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d11}, [r0, :64], r1
        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d12}, [r0, :64], r1
        vst1.16         {d5},  [r12,:64], r1
        vst1.16         {d13}, [r0, :64], r1
        vst1.16         {d6},  [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1
        vst1.16         {d8},  [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1
        vst1.16         {d9},  [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
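        @ (Only the inner 6 pixels were changed: p2 = d21, p1 = d22, p0 = d23,
        @ q0 = d24, q1 = d25, q2 = d26.)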
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v

function vp9_loop_filter_h_16_4_16_neon
        sub             r12, r0,  #16
        sub             r0,  r0,  #8
        vld1.16         {d16}, [r12,:64], r1
        vld1.16         {d20}, [r0, :64], r1
        vld1.16         {d17}, [r12,:64], r1
        vld1.16         {d21}, [r0, :64], r1
        vld1.16         {d18}, [r12,:64], r1
        vld1.16         {d22}, [r0, :64], r1
        vld1.16         {d19}, [r12,:64], r1
        vld1.16         {d23}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16
        vld1.16         {d24}, [r12,:64], r1
        vld1.16         {d28}, [r0, :64], r1
        vld1.16         {d25}, [r12,:64], r1
        vld1.16         {d29}, [r0, :64], r1
        vld1.16         {d26}, [r12,:64], r1
        vld1.16         {d30}, [r0, :64], r1
        vld1.16         {d27}, [r12,:64], r1
        vld1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #2
        sub             r12, r12, #16
        sub             r0,  r0,  #16

        @ The 16x4 pixels read above are in four 4x4 blocks
        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        transpose16_q_4x4 q14, q15, d28, d29, d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for the transpose, since
        @ not all d registers in the transpose are consecutive.
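        @ (The 16 output columns are d16 (p7, unchanged), d2-d6, d8, d9
        @ (p6-p0), d10-d15, d17 (q0-q6) and d31 (q7, unchanged); each group of
        @ four below forms one 4x4 block.)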
        transpose16_4x4 d16, d2,  d3,  d4
        transpose16_4x4 d5,  d6,  d8,  d9
        transpose16_4x4 d10, d11, d12, d13
        transpose16_4x4 d14, d15, d17, d31

        vst1.16         {d16}, [r12,:64], r1
        vst1.16         {d5},  [r0, :64], r1

        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d6},  [r0, :64], r1

        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d8},  [r0, :64], r1

        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d9},  [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16

        vst1.16         {d10}, [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1

        vst1.16         {d11}, [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1

        vst1.16         {d12}, [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1

        vst1.16         {d13}, [r12,:64], r1
        vst1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  #8
        bx              lr
9:
        add             r0,  r0,  #8
        bx              lr
8:
        add             r12, r12, #8
        add             r0,  r0,  #8
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27

        vst1.16         {d20}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d27}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        bx              lr
7:
        add             r12, r12, #12
        add             r0,  r12, r1, lsl #1
        transpose16_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.16         {d22}, [r12], r1
        vst1.16         {d24}, [r0],  r1
        vst1.16         {d23}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        sub             r0,  r0,  r1, lsl #2
        add             r0,  r0,  #4
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h