/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"
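
// The lpf_16_wd* functions below filter 16 pixels across one edge at a
// time. Input/output convention, as set up by the lpf_func driver at the
// bottom of this file:
//
//   v10       E threshold, 2*(L + 2) + I, per pixel
//   v11       I threshold (limit), per pixel
//   v12       H threshold (hev), per pixel
//   v13       mask of pixels to filter at all (wd >= 4)
//   v14       mask of pixels with wd > 4
//   v15       mask of pixels with wd == 16
//   v17-v30   pixel rows/columns p6, p5, p4, p3, p2, p1, p0, q0, q1, q2,
//             q3, q4, q5, q6 (narrower widths use only the central subset)
//   x13-x15   alternative return addresses for the shorter epilogues
//             (write only the inner 6 pixels, only the inner 4, or none)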
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
        uabd            v0.16b, v22.16b, v23.16b        // abs(p1 - p0)
        uabd            v1.16b, v25.16b, v24.16b        // abs(q1 - q0)
        uabd            v2.16b, v23.16b, v24.16b        // abs(p0 - q0)
        uabd            v3.16b, v22.16b, v25.16b        // abs(p1 - q1)
.if \wd >= 6
        uabd            v4.16b, v21.16b, v22.16b        // abs(p2 - p1)
        uabd            v5.16b, v26.16b, v25.16b        // abs(q2 - q1)
.endif
.if \wd >= 8
        uabd            v6.16b, v20.16b, v21.16b        // abs(p3 - p2)
        uabd            v7.16b, v27.16b, v26.16b        // abs(q3 - q2)
.endif
.if \wd >= 6
        umax            v4.16b, v4.16b, v5.16b
.endif
        uqadd           v2.16b, v2.16b, v2.16b          // abs(p0 - q0) * 2
.if \wd >= 8
        umax            v6.16b, v6.16b, v7.16b
.endif
        ushr            v3.16b, v3.16b, #1
.if \wd >= 8
        umax            v4.16b, v4.16b, v6.16b
.endif
.if \wd >= 6
        and             v4.16b, v4.16b, v14.16b
.endif
        umax            v0.16b, v0.16b, v1.16b          // max(abs(p1 - p0), abs(q1 - q0))
        uqadd           v2.16b, v2.16b, v3.16b          // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
.if \wd >= 6
        umax            v4.16b, v0.16b, v4.16b
        cmhs            v1.16b, v11.16b, v4.16b         // max(abs(p1 - p0), abs(q1 - q0), abs(p2 - p1), abs(q2 - q1), ...) <= I
.else
        cmhs            v1.16b, v11.16b, v0.16b         // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
        cmhs            v2.16b, v10.16b, v2.16b         // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E
        and             v1.16b, v1.16b, v2.16b          // fm
        and             v1.16b, v1.16b, v13.16b         // fm && wd >= 4
.if \wd >= 6
        and             v14.16b, v14.16b, v1.16b        // fm && wd > 4
.endif
.if \wd >= 16
        and             v15.16b, v15.16b, v1.16b        // fm && wd == 16
.endif

        mov             x16, v1.d[0]
        mov             x17, v1.d[1]
        adds            x16, x16, x17
        b.eq            9f                              // if (!fm || wd < 4) return;
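
        // wd >= 6: compute the flat8in mask: the inner pixels count as
        // "flat" when abs(p - p0) <= 1 and abs(q - q0) <= 1 for the two
        // (wd == 6) or three (wd >= 8) nearest neighbours on each side of
        // the edge. For wd == 16, flat8out applies the same test to the
        // outer pixels p6-p4 and q4-q6. The flat masks select the strong
        // averaging filters further down.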
.if \wd >= 6
        movi            v10.16b, #1
        uabd            v2.16b, v21.16b, v23.16b        // abs(p2 - p0)
        uabd            v3.16b, v22.16b, v23.16b        // abs(p1 - p0)
        uabd            v4.16b, v25.16b, v24.16b        // abs(q1 - q0)
        uabd            v5.16b, v26.16b, v24.16b        // abs(q2 - q0)
.if \wd >= 8
        uabd            v6.16b, v20.16b, v23.16b        // abs(p3 - p0)
        uabd            v7.16b, v27.16b, v24.16b        // abs(q3 - q0)
.endif
        umax            v2.16b, v2.16b, v3.16b
        umax            v4.16b, v4.16b, v5.16b
.if \wd >= 8
        umax            v6.16b, v6.16b, v7.16b
.endif
        umax            v2.16b, v2.16b, v4.16b
.if \wd >= 8
        umax            v2.16b, v2.16b, v6.16b
.endif

.if \wd == 16
        uabd            v3.16b, v17.16b, v23.16b        // abs(p6 - p0)
        uabd            v4.16b, v18.16b, v23.16b        // abs(p5 - p0)
        uabd            v5.16b, v19.16b, v23.16b        // abs(p4 - p0)
.endif
        cmhs            v2.16b, v10.16b, v2.16b         // flat8in
.if \wd == 16
        uabd            v6.16b, v28.16b, v24.16b        // abs(q4 - q0)
        uabd            v7.16b, v29.16b, v24.16b        // abs(q5 - q0)
        uabd            v8.16b, v30.16b, v24.16b        // abs(q6 - q0)
.endif
        and             v14.16b, v2.16b, v14.16b        // flat8in && fm && wd > 4
        bic             v1.16b, v1.16b, v14.16b         // fm && wd >= 4 && !flat8in
.if \wd == 16
        umax            v3.16b, v3.16b, v4.16b
        umax            v5.16b, v5.16b, v6.16b
.endif
        mov             x16, v1.d[0]
        mov             x17, v1.d[1]
.if \wd == 16
        umax            v7.16b, v7.16b, v8.16b
        umax            v3.16b, v3.16b, v5.16b
        umax            v3.16b, v3.16b, v7.16b
        cmhs            v3.16b, v10.16b, v3.16b         // flat8out
.endif
        adds            x16, x16, x17
.if \wd == 16
        and             v15.16b, v15.16b, v3.16b        // flat8out && fm && wd == 16
        and             v15.16b, v15.16b, v14.16b       // flat8out && flat8in && fm && wd == 16
        bic             v14.16b, v14.16b, v15.16b       // flat8in && fm && wd >= 4 && !flat8out
.endif
        b.eq            1f                              // skip wd == 4 case
.endif
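
        // The narrow filter, applied to every lane for wd == 4 and to the
        // non-flat lanes for wd >= 6. A rough C sketch of the per-pixel
        // math, with iclip_diff() clamping to the signed 8-bit range
        // [-128, 127]:
        //
        //   hev = max(abs(p1 - p0), abs(q1 - q0)) > H;
        //   f   = iclip_diff(3 * (q0 - p0) + (hev ? iclip_diff(p1 - q1) : 0));
        //   f1  = imin(f + 4, 127) >> 3;
        //   f2  = imin(f + 3, 127) >> 3;
        //   p0  = iclip_pixel(p0 + f2);   q0 = iclip_pixel(q0 - f1);
        //   if (!hev) { p1 += (f1 + 1) >> 1;  q1 -= (f1 + 1) >> 1; }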
        movi            v3.16b, #128
        eor             v2.16b, v22.16b, v3.16b         // p1 - 128
        eor             v3.16b, v25.16b, v3.16b         // q1 - 128
        cmhi            v0.16b, v0.16b, v12.16b         // hev
        sqsub           v2.16b, v2.16b, v3.16b          // iclip_diff(p1 - q1)
        and             v4.16b, v2.16b, v0.16b          // if (hev) iclip_diff(p1 - q1)
        bic             v0.16b, v1.16b, v0.16b          // (fm && wd >= 4 && !hev)
        usubl           v2.8h, v24.8b, v23.8b
        movi            v5.8h, #3
        usubl2          v3.8h, v24.16b, v23.16b
        mul             v2.8h, v2.8h, v5.8h
        mul             v3.8h, v3.8h, v5.8h
        movi            v6.16b, #4
        saddw           v2.8h, v2.8h, v4.8b
        saddw2          v3.8h, v3.8h, v4.16b
        movi            v7.16b, #3
        sqxtn           v2.8b, v2.8h                    // f
        sqxtn2          v2.16b, v3.8h
        sqadd           v4.16b, v6.16b, v2.16b          // imin(f + 4, 127)
        sqadd           v5.16b, v7.16b, v2.16b          // imin(f + 3, 127)
        sshr            v4.16b, v4.16b, #3              // f1
        sshr            v5.16b, v5.16b, #3              // f2
        mov             v2.16b, v23.16b                 // p0
        mov             v3.16b, v24.16b                 // q0
        neg             v6.16b, v4.16b                  // -f1
        srshr           v4.16b, v4.16b, #1              // (f1 + 1) >> 1
        // p0 + f2, q0 - f1
        usqadd          v2.16b, v5.16b                  // out p0
        usqadd          v3.16b, v6.16b                  // out q0
        neg             v6.16b, v4.16b                  // -((f1 + 1) >> 1)
        bit             v23.16b, v2.16b, v1.16b         // if (fm && wd >= 4)
        bit             v24.16b, v3.16b, v1.16b         // if (fm && wd >= 4)
        mov             v2.16b, v22.16b                 // p1
        mov             v3.16b, v25.16b                 // q1
        // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
        usqadd          v2.16b, v4.16b                  // out p1
        usqadd          v3.16b, v6.16b                  // out q1
        bit             v22.16b, v2.16b, v0.16b         // if (fm && wd >= 4 && !hev)
        bit             v25.16b, v3.16b, v0.16b         // if (fm && wd >= 4 && !hev)
1:
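
        // Flat filters for wd == 6 and wd == 8: each output is a rounded
        // 8-tap average, e.g. for wd == 8:
        //
        //   p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
        //
        // Instead of recomputing the sum for each output, the code keeps a
        // running sum in v8/v9, subtracting the sample pair that drops out
        // of the window and adding the pair that enters as it slides.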
.if \wd == 6
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
        b.eq            2f                              // skip if there's no flat8in

        uaddl           v0.8h, v21.8b, v21.8b           // p2 * 2
        uaddl2          v1.8h, v21.16b, v21.16b
        uaddl           v2.8h, v21.8b, v22.8b           // p2 + p1
        uaddl2          v3.8h, v21.16b, v22.16b
        uaddl           v4.8h, v22.8b, v23.8b           // p1 + p0
        uaddl2          v5.8h, v22.16b, v23.16b
        uaddl           v6.8h, v23.8b, v24.8b           // p0 + q0
        uaddl2          v7.8h, v23.16b, v24.16b
        add             v8.8h, v0.8h, v2.8h
        add             v9.8h, v1.8h, v3.8h
        add             v10.8h, v4.8h, v6.8h
        add             v11.8h, v5.8h, v7.8h
        uaddl           v12.8h, v24.8b, v25.8b          // q0 + q1
        uaddl2          v13.8h, v24.16b, v25.16b
        add             v8.8h, v8.8h, v10.8h
        add             v9.8h, v9.8h, v11.8h
        sub             v12.8h, v12.8h, v0.8h
        sub             v13.8h, v13.8h, v1.8h
        uaddl           v10.8h, v25.8b, v26.8b          // q1 + q2
        uaddl2          v11.8h, v25.16b, v26.16b
        rshrn           v0.8b, v8.8h, #3                // out p1
        rshrn2          v0.16b, v9.8h, #3

        add             v8.8h, v8.8h, v12.8h
        add             v9.8h, v9.8h, v13.8h
        sub             v10.8h, v10.8h, v2.8h
        sub             v11.8h, v11.8h, v3.8h
        uaddl           v12.8h, v26.8b, v26.8b          // q2 + q2
        uaddl2          v13.8h, v26.16b, v26.16b
        rshrn           v1.8b, v8.8h, #3                // out p0
        rshrn2          v1.16b, v9.8h, #3

        add             v8.8h, v8.8h, v10.8h
        add             v9.8h, v9.8h, v11.8h
        sub             v12.8h, v12.8h, v4.8h
        sub             v13.8h, v13.8h, v5.8h
        rshrn           v2.8b, v8.8h, #3                // out q0
        rshrn2          v2.16b, v9.8h, #3

        bit             v22.16b, v0.16b, v14.16b        // p1 if (flat8in)
        add             v8.8h, v8.8h, v12.8h
        add             v9.8h, v9.8h, v13.8h
        bit             v23.16b, v1.16b, v14.16b        // p0 if (flat8in)
        rshrn           v3.8b, v8.8h, #3                // out q1
        rshrn2          v3.16b, v9.8h, #3
        bit             v24.16b, v2.16b, v14.16b        // q0 if (flat8in)
        bit             v25.16b, v3.16b, v14.16b        // q1 if (flat8in)
.elseif \wd >= 8
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
.if \wd == 8
        b.eq            8f                              // skip if there's no flat8in
.else
        b.eq            2f                              // skip if there's no flat8in
.endif

        uaddl           v0.8h, v20.8b, v21.8b           // p3 + p2
        uaddl2          v1.8h, v20.16b, v21.16b
        uaddl           v2.8h, v22.8b, v25.8b           // p1 + q1
        uaddl2          v3.8h, v22.16b, v25.16b
        uaddl           v4.8h, v20.8b, v22.8b           // p3 + p1
        uaddl2          v5.8h, v20.16b, v22.16b
        uaddl           v6.8h, v23.8b, v26.8b           // p0 + q2
        uaddl2          v7.8h, v23.16b, v26.16b
        add             v8.8h, v0.8h, v0.8h             // 2 * (p3 + p2)
        add             v9.8h, v1.8h, v1.8h
        uaddw           v8.8h, v8.8h, v23.8b            // + p0
        uaddw2          v9.8h, v9.8h, v23.16b
        uaddw           v8.8h, v8.8h, v24.8b            // + q0
        uaddw2          v9.8h, v9.8h, v24.16b
        add             v8.8h, v8.8h, v4.8h
        add             v9.8h, v9.8h, v5.8h             // + p3 + p1
        sub             v2.8h, v2.8h, v0.8h             // p1 + q1 - p3 - p2
        sub             v3.8h, v3.8h, v1.8h
        sub             v6.8h, v6.8h, v4.8h             // p0 + q2 - p3 - p1
        sub             v7.8h, v7.8h, v5.8h
        rshrn           v10.8b, v8.8h, #3               // out p2
        rshrn2          v10.16b, v9.8h, #3

        add             v8.8h, v8.8h, v2.8h
        add             v9.8h, v9.8h, v3.8h
        uaddl           v0.8h, v20.8b, v23.8b           // p3 + p0
        uaddl2          v1.8h, v20.16b, v23.16b
        uaddl           v2.8h, v24.8b, v27.8b           // q0 + q3
        uaddl2          v3.8h, v24.16b, v27.16b
        rshrn           v11.8b, v8.8h, #3               // out p1
        rshrn2          v11.16b, v9.8h, #3

        add             v8.8h, v8.8h, v6.8h
        add             v9.8h, v9.8h, v7.8h
        sub             v2.8h, v2.8h, v0.8h             // q0 + q3 - p3 - p0
        sub             v3.8h, v3.8h, v1.8h
        uaddl           v4.8h, v21.8b, v24.8b           // p2 + q0
        uaddl2          v5.8h, v21.16b, v24.16b
        uaddl           v6.8h, v25.8b, v27.8b           // q1 + q3
        uaddl2          v7.8h, v25.16b, v27.16b
        rshrn           v12.8b, v8.8h, #3               // out p0
        rshrn2          v12.16b, v9.8h, #3

        add             v8.8h, v8.8h, v2.8h
        add             v9.8h, v9.8h, v3.8h
        sub             v6.8h, v6.8h, v4.8h             // q1 + q3 - p2 - q0
        sub             v7.8h, v7.8h, v5.8h
        uaddl           v0.8h, v22.8b, v25.8b           // p1 + q1
        uaddl2          v1.8h, v22.16b, v25.16b
        uaddl           v2.8h, v26.8b, v27.8b           // q2 + q3
        uaddl2          v3.8h, v26.16b, v27.16b
        rshrn           v13.8b, v8.8h, #3               // out q0
        rshrn2          v13.16b, v9.8h, #3

        add             v8.8h, v8.8h, v6.8h
        add             v9.8h, v9.8h, v7.8h
        sub             v2.8h, v2.8h, v0.8h             // q2 + q3 - p1 - q1
        sub             v3.8h, v3.8h, v1.8h
        rshrn           v0.8b, v8.8h, #3                // out q1
        rshrn2          v0.16b, v9.8h, #3

        add             v8.8h, v8.8h, v2.8h
        add             v9.8h, v9.8h, v3.8h

        bit             v21.16b, v10.16b, v14.16b       // p2 if (flat8in)
        bit             v22.16b, v11.16b, v14.16b       // p1 if (flat8in)
        bit             v23.16b, v12.16b, v14.16b       // p0 if (flat8in)
        rshrn           v1.8b, v8.8h, #3                // out q2
        rshrn2          v1.16b, v9.8h, #3
        bit             v24.16b, v13.16b, v14.16b       // q0 if (flat8in)
        bit             v25.16b, v0.16b, v14.16b        // q1 if (flat8in)
        bit             v26.16b, v1.16b, v14.16b        // q2 if (flat8in)
.endif
2:
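        // wd == 16: extend the same running-sum scheme to rounded averages
        // whose weights sum to 16 (note the #4 shifts), updating six pixels
        // on each side of the edge. The checks below first fall back to the
        // shorter epilogues (7f/8f) when no lane needs the full flat8out
        // filter.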
.if \wd == 16
        mov             x16, v15.d[0]
        mov             x17, v15.d[1]
        adds            x16, x16, x17
        b.ne            1f                              // check if flat8out is needed
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
        b.eq            8f                              // if there was no flat8in, just write the inner 4 pixels
        b               7f                              // if flat8in was used, write the inner 6 pixels
1:

        uaddl           v2.8h, v17.8b, v17.8b           // p6 + p6
        uaddl2          v3.8h, v17.16b, v17.16b
        uaddl           v4.8h, v17.8b, v18.8b           // p6 + p5
        uaddl2          v5.8h, v17.16b, v18.16b
        uaddl           v6.8h, v17.8b, v19.8b           // p6 + p4
        uaddl2          v7.8h, v17.16b, v19.16b
        uaddl           v8.8h, v17.8b, v20.8b           // p6 + p3
        uaddl2          v9.8h, v17.16b, v20.16b
        add             v12.8h, v2.8h, v4.8h
        add             v13.8h, v3.8h, v5.8h
        add             v10.8h, v6.8h, v8.8h
        add             v11.8h, v7.8h, v9.8h
        uaddl           v6.8h, v17.8b, v21.8b           // p6 + p2
        uaddl2          v7.8h, v17.16b, v21.16b
        add             v12.8h, v12.8h, v10.8h
        add             v13.8h, v13.8h, v11.8h
        uaddl           v8.8h, v17.8b, v22.8b           // p6 + p1
        uaddl2          v9.8h, v17.16b, v22.16b
        uaddl           v10.8h, v18.8b, v23.8b          // p5 + p0
        uaddl2          v11.8h, v18.16b, v23.16b
        add             v6.8h, v6.8h, v8.8h
        add             v7.8h, v7.8h, v9.8h
        uaddl           v8.8h, v19.8b, v24.8b           // p4 + q0
        uaddl2          v9.8h, v19.16b, v24.16b
        add             v12.8h, v12.8h, v6.8h
        add             v13.8h, v13.8h, v7.8h
        add             v10.8h, v10.8h, v8.8h
        add             v11.8h, v11.8h, v9.8h
        uaddl           v6.8h, v20.8b, v25.8b           // p3 + q1
        uaddl2          v7.8h, v20.16b, v25.16b
        add             v12.8h, v12.8h, v10.8h
        add             v13.8h, v13.8h, v11.8h
        sub             v6.8h, v6.8h, v2.8h
        sub             v7.8h, v7.8h, v3.8h
        uaddl           v2.8h, v21.8b, v26.8b           // p2 + q2
        uaddl2          v3.8h, v21.16b, v26.16b
        rshrn           v0.8b, v12.8h, #4               // out p5
        rshrn2          v0.16b, v13.8h, #4
        add             v12.8h, v12.8h, v6.8h           // - (p6 + p6) + (p3 + q1)
        add             v13.8h, v13.8h, v7.8h
        sub             v2.8h, v2.8h, v4.8h
        sub             v3.8h, v3.8h, v5.8h
        uaddl           v4.8h, v22.8b, v27.8b           // p1 + q3
        uaddl2          v5.8h, v22.16b, v27.16b
        uaddl           v6.8h, v17.8b, v19.8b           // p6 + p4
        uaddl2          v7.8h, v17.16b, v19.16b
        rshrn           v1.8b, v12.8h, #4               // out p4
        rshrn2          v1.16b, v13.8h, #4
        add             v12.8h, v12.8h, v2.8h           // - (p6 + p5) + (p2 + q2)
        add             v13.8h, v13.8h, v3.8h
        sub             v4.8h, v4.8h, v6.8h
        sub             v5.8h, v5.8h, v7.8h
        uaddl           v6.8h, v23.8b, v28.8b           // p0 + q4
        uaddl2          v7.8h, v23.16b, v28.16b
        uaddl           v8.8h, v17.8b, v20.8b           // p6 + p3
        uaddl2          v9.8h, v17.16b, v20.16b
        rshrn           v2.8b, v12.8h, #4               // out p3
        rshrn2          v2.16b, v13.8h, #4
        add             v12.8h, v12.8h, v4.8h           // - (p6 + p4) + (p1 + q3)
        add             v13.8h, v13.8h, v5.8h
        sub             v6.8h, v6.8h, v8.8h
        sub             v7.8h, v7.8h, v9.8h
        uaddl           v8.8h, v24.8b, v29.8b           // q0 + q5
        uaddl2          v9.8h, v24.16b, v29.16b
        uaddl           v4.8h, v17.8b, v21.8b           // p6 + p2
        uaddl2          v5.8h, v17.16b, v21.16b
        rshrn           v3.8b, v12.8h, #4               // out p2
        rshrn2          v3.16b, v13.8h, #4
        add             v12.8h, v12.8h, v6.8h           // - (p6 + p3) + (p0 + q4)
        add             v13.8h, v13.8h, v7.8h
        sub             v8.8h, v8.8h, v4.8h
        sub             v9.8h, v9.8h, v5.8h
        uaddl           v6.8h, v25.8b, v30.8b           // q1 + q6
        uaddl2          v7.8h, v25.16b, v30.16b
        uaddl           v10.8h, v17.8b, v22.8b          // p6 + p1
        uaddl2          v11.8h, v17.16b, v22.16b
        rshrn           v4.8b, v12.8h, #4               // out p1
        rshrn2          v4.16b, v13.8h, #4
        add             v12.8h, v12.8h, v8.8h           // - (p6 + p2) + (q0 + q5)
        add             v13.8h, v13.8h, v9.8h
        sub             v6.8h, v6.8h, v10.8h
        sub             v7.8h, v7.8h, v11.8h
        uaddl           v8.8h, v26.8b, v30.8b           // q2 + q6
        uaddl2          v9.8h, v26.16b, v30.16b
        bif             v0.16b, v18.16b, v15.16b        // out p5
        uaddl           v10.8h, v18.8b, v23.8b          // p5 + p0
        uaddl2          v11.8h, v18.16b, v23.16b
        rshrn           v5.8b, v12.8h, #4               // out p0
        rshrn2          v5.16b, v13.8h, #4
        add             v12.8h, v12.8h, v6.8h           // - (p6 + p1) + (q1 + q6)
        add             v13.8h, v13.8h, v7.8h
        sub             v8.8h, v8.8h, v10.8h
        sub             v9.8h, v9.8h, v11.8h
        uaddl           v10.8h, v27.8b, v30.8b          // q3 + q6
        uaddl2          v11.8h, v27.16b, v30.16b
        bif             v1.16b, v19.16b, v15.16b        // out p4
        uaddl           v18.8h, v19.8b, v24.8b          // p4 + q0
        uaddl2          v19.8h, v19.16b, v24.16b
        rshrn           v6.8b, v12.8h, #4               // out q0
        rshrn2          v6.16b, v13.8h, #4
        add             v12.8h, v12.8h, v8.8h           // - (p5 + p0) + (q2 + q6)
        add             v13.8h, v13.8h, v9.8h
        sub             v10.8h, v10.8h, v18.8h
        sub             v11.8h, v11.8h, v19.8h
        uaddl           v8.8h, v28.8b, v30.8b           // q4 + q6
        uaddl2          v9.8h, v28.16b, v30.16b
        bif             v2.16b, v20.16b, v15.16b        // out p3
        uaddl           v18.8h, v20.8b, v25.8b          // p3 + q1
        uaddl2          v19.8h, v20.16b, v25.16b
        rshrn           v7.8b, v12.8h, #4               // out q1
        rshrn2          v7.16b, v13.8h, #4
        add             v12.8h, v12.8h, v10.8h          // - (p4 + q0) + (q3 + q6)
        add             v13.8h, v13.8h, v11.8h
        sub             v18.8h, v8.8h, v18.8h
        sub             v19.8h, v9.8h, v19.8h
        uaddl           v10.8h, v29.8b, v30.8b          // q5 + q6
        uaddl2          v11.8h, v29.16b, v30.16b
        bif             v3.16b, v21.16b, v15.16b        // out p2
        uaddl           v20.8h, v21.8b, v26.8b          // p2 + q2
        uaddl2          v21.8h, v21.16b, v26.16b
        rshrn           v8.8b, v12.8h, #4               // out q2
        rshrn2          v8.16b, v13.8h, #4
        add             v12.8h, v12.8h, v18.8h          // - (p3 + q1) + (q4 + q6)
        add             v13.8h, v13.8h, v19.8h
        sub             v10.8h, v10.8h, v20.8h
        sub             v11.8h, v11.8h, v21.8h
        uaddl           v18.8h, v30.8b, v30.8b          // q6 + q6
        uaddl2          v19.8h, v30.16b, v30.16b
        bif             v4.16b, v22.16b, v15.16b        // out p1
        uaddl           v20.8h, v22.8b, v27.8b          // p1 + q3
        uaddl2          v21.8h, v22.16b, v27.16b
        rshrn           v9.8b, v12.8h, #4               // out q3
        rshrn2          v9.16b, v13.8h, #4
        add             v12.8h, v12.8h, v10.8h          // - (p2 + q2) + (q5 + q6)
        add             v13.8h, v13.8h, v11.8h
        sub             v18.8h, v18.8h, v20.8h
        sub             v19.8h, v19.8h, v21.8h
        bif             v5.16b, v23.16b, v15.16b        // out p0
        rshrn           v10.8b, v12.8h, #4              // out q4
        rshrn2          v10.16b, v13.8h, #4
        add             v12.8h, v12.8h, v18.8h          // - (p1 + q3) + (q6 + q6)
        add             v13.8h, v13.8h, v19.8h
        rshrn           v11.8b, v12.8h, #4              // out q5
        rshrn2          v11.16b, v13.8h, #4
        bif             v6.16b, v24.16b, v15.16b        // out q0
        bif             v7.16b, v25.16b, v15.16b        // out q1
        bif             v8.16b, v26.16b, v15.16b        // out q2
        bif             v9.16b, v27.16b, v15.16b        // out q3
        bif             v10.16b, v28.16b, v15.16b       // out q4
        bif             v11.16b, v29.16b, v15.16b       // out q5
.endif

        ret
.if \wd == 16
7:
        // Return to a shorter epilogue, writing only the inner 6 pixels
        ret             x13
.endif
.if \wd >= 8
8:
        // Return to a shorter epilogue, writing only the inner 4 pixels
        ret             x14
.endif
9:
        // Return directly without writing back any pixels
        ret             x15
endfunc
.endm

loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4
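
// Wrappers for calling the filter functions above. lpf_16_wd16/wd8 point
// x13/x14 at the caller-local labels 7: and 8:, so that the filter can
// return straight into a shorter store epilogue (inner 6 or inner 4
// pixels only); x15, set in each entry point below, returns past the
// stores entirely when nothing was filtered.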
.macro lpf_16_wd16
        adr             x13, 7f
        adr             x14, 8f
        bl              lpf_16_wd16_neon
.endm

.macro lpf_16_wd8
        adr             x14, 8f
        bl              lpf_16_wd8_neon
.endm

.macro lpf_16_wd6
        bl              lpf_16_wd6_neon
.endm

.macro lpf_16_wd4
        bl              lpf_16_wd4_neon
.endm
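
// Per-edge entry points, each filtering 16 pixels. The lpf_v_* variants
// load whole rows above and below a horizontal edge directly; the lpf_h_*
// variants gather 4 or 8 pixel wide slices around a vertical edge,
// transpose them so that each vector holds one pixel column, filter,
// transpose back, and store the results.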
function lpf_v_4_16_neon
        mov             x15, x30
        sub             x16, x0, x1, lsl #1
        ld1             {v22.16b}, [x16], x1            // p1
        ld1             {v24.16b}, [x0], x1             // q0
        ld1             {v23.16b}, [x16], x1            // p0
        ld1             {v25.16b}, [x0], x1             // q1
        sub             x0, x0, x1, lsl #1

        lpf_16_wd4

        sub             x16, x0, x1, lsl #1
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v25.16b}, [x0], x1             // q1
        sub             x0, x0, x1, lsl #1
        ret             x15
endfunc

function lpf_h_4_16_neon
        mov             x15, x30
        sub             x16, x0, #2
        add             x0, x16, x1, lsl #3
        ld1             {v22.s}[0], [x16], x1
        ld1             {v22.s}[2], [x0], x1
        ld1             {v23.s}[0], [x16], x1
        ld1             {v23.s}[2], [x0], x1
        ld1             {v24.s}[0], [x16], x1
        ld1             {v24.s}[2], [x0], x1
        ld1             {v25.s}[0], [x16], x1
        ld1             {v25.s}[2], [x0], x1
        ld1             {v22.s}[1], [x16], x1
        ld1             {v22.s}[3], [x0], x1
        ld1             {v23.s}[1], [x16], x1
        ld1             {v23.s}[3], [x0], x1
        ld1             {v24.s}[1], [x16], x1
        ld1             {v24.s}[3], [x0], x1
        ld1             {v25.s}[1], [x16], x1
        ld1             {v25.s}[3], [x0], x1
        add             x0, x0, #2

        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd4

        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0], x1
        add             x0, x0, #2
        ret             x15
endfunc

function lpf_v_6_16_neon
        mov             x15, x30
        sub             x16, x0, x1, lsl #1
        sub             x16, x16, x1
        ld1             {v21.16b}, [x16], x1            // p2
        ld1             {v24.16b}, [x0], x1             // q0
        ld1             {v22.16b}, [x16], x1            // p1
        ld1             {v25.16b}, [x0], x1             // q1
        ld1             {v23.16b}, [x16], x1            // p0
        ld1             {v26.16b}, [x0], x1             // q2
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1

        lpf_16_wd6

        sub             x16, x0, x1, lsl #1
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v25.16b}, [x0], x1             // q1
        sub             x0, x0, x1, lsl #1
        ret             x15
endfunc

function lpf_h_6_16_neon
        mov             x15, x30
        sub             x16, x0, #4
        add             x0, x16, x1, lsl #3
        ld1             {v20.d}[0], [x16], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.d}[0], [x16], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.d}[0], [x16], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.d}[0], [x16], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.d}[0], [x16], x1
        ld1             {v27.d}[1], [x0], x1
        add             x0, x0, #4

        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd6

        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0], x1
        add             x0, x0, #2
        ret             x15
endfunc

function lpf_v_8_16_neon
        mov             x15, x30
        sub             x16, x0, x1, lsl #2
        ld1             {v20.16b}, [x16], x1            // p3
        ld1             {v24.16b}, [x0], x1             // q0
        ld1             {v21.16b}, [x16], x1            // p2
        ld1             {v25.16b}, [x0], x1             // q1
        ld1             {v22.16b}, [x16], x1            // p1
        ld1             {v26.16b}, [x0], x1             // q2
        ld1             {v23.16b}, [x16], x1            // p0
        ld1             {v27.16b}, [x0], x1             // q3
        sub             x0, x0, x1, lsl #2

        lpf_16_wd8

        sub             x16, x0, x1, lsl #1
        sub             x16, x16, x1
        st1             {v21.16b}, [x16], x1            // p2
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v25.16b}, [x0], x1             // q1
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v26.16b}, [x0], x1             // q2
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1
        ret             x15

8:
        sub             x16, x0, x1, lsl #1
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v25.16b}, [x0], x1             // q1
        sub             x0, x0, x1, lsl #1
        ret             x15
endfunc

function lpf_h_8_16_neon
        mov             x15, x30
        sub             x16, x0, #4
        add             x0, x16, x1, lsl #3
        ld1             {v20.d}[0], [x16], x1
        ld1             {v20.d}[1], [x0], x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v21.d}[1], [x0], x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v22.d}[1], [x0], x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v23.d}[1], [x0], x1
        ld1             {v24.d}[0], [x16], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v25.d}[0], [x16], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v26.d}[0], [x16], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v27.d}[0], [x16], x1
        ld1             {v27.d}[1], [x0], x1
        add             x0, x0, #4

        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd8

        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #4
        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v20.d}[0], [x16], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.d}[0], [x16], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.d}[0], [x16], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x16], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x16], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x16], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.d}[0], [x16], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.d}[0], [x16], x1
        st1             {v27.d}[1], [x0], x1
        add             x0, x0, #4
        ret             x15
8:
        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0], x1
        add             x0, x0, #2
        ret             x15
endfunc

function lpf_v_16_16_neon
        mov             x15, x30

        sub             x16, x0, x1, lsl #3
        add             x16, x16, x1
        ld1             {v17.16b}, [x16], x1            // p6
        ld1             {v24.16b}, [x0], x1             // q0
        ld1             {v18.16b}, [x16], x1            // p5
        ld1             {v25.16b}, [x0], x1             // q1
        ld1             {v19.16b}, [x16], x1            // p4
        ld1             {v26.16b}, [x0], x1             // q2
        ld1             {v20.16b}, [x16], x1            // p3
        ld1             {v27.16b}, [x0], x1             // q3
        ld1             {v21.16b}, [x16], x1            // p2
        ld1             {v28.16b}, [x0], x1             // q4
        ld1             {v22.16b}, [x16], x1            // p1
        ld1             {v29.16b}, [x0], x1             // q5
        ld1             {v23.16b}, [x16], x1            // p0
        ld1             {v30.16b}, [x0], x1             // q6
        sub             x0, x0, x1, lsl #3
        add             x0, x0, x1

        lpf_16_wd16

        sub             x16, x0, x1, lsl #2
        sub             x16, x16, x1, lsl #1
        st1             {v0.16b}, [x16], x1             // p5
        st1             {v6.16b}, [x0], x1              // q0
        st1             {v1.16b}, [x16], x1             // p4
        st1             {v7.16b}, [x0], x1              // q1
        st1             {v2.16b}, [x16], x1             // p3
        st1             {v8.16b}, [x0], x1              // q2
        st1             {v3.16b}, [x16], x1             // p2
        st1             {v9.16b}, [x0], x1              // q3
        st1             {v4.16b}, [x16], x1             // p1
        st1             {v10.16b}, [x0], x1             // q4
        st1             {v5.16b}, [x16], x1             // p0
        st1             {v11.16b}, [x0], x1             // q5
        sub             x0, x0, x1, lsl #2
        sub             x0, x0, x1, lsl #1
        ret             x15
7:
        sub             x16, x0, x1
        sub             x16, x16, x1, lsl #1
        st1             {v21.16b}, [x16], x1            // p2
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v25.16b}, [x0], x1             // q1
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v26.16b}, [x0], x1             // q2
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1
        ret             x15

8:
        sub             x16, x0, x1, lsl #1
        st1             {v22.16b}, [x16], x1            // p1
        st1             {v24.16b}, [x0], x1             // q0
        st1             {v23.16b}, [x16], x1            // p0
        st1             {v25.16b}, [x0], x1             // q1
        sub             x0, x0, x1, lsl #1
        ret             x15
endfunc

function lpf_h_16_16_neon
        mov             x15, x30
        sub             x16, x0, #8
        ld1             {v16.d}[0], [x16], x1
        ld1             {v24.d}[0], [x0], x1
        ld1             {v17.d}[0], [x16], x1
        ld1             {v25.d}[0], [x0], x1
        ld1             {v18.d}[0], [x16], x1
        ld1             {v26.d}[0], [x0], x1
        ld1             {v19.d}[0], [x16], x1
        ld1             {v27.d}[0], [x0], x1
        ld1             {v20.d}[0], [x16], x1
        ld1             {v28.d}[0], [x0], x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v29.d}[0], [x0], x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v30.d}[0], [x0], x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v31.d}[0], [x0], x1
        ld1             {v16.d}[1], [x16], x1
        ld1             {v24.d}[1], [x0], x1
        ld1             {v17.d}[1], [x16], x1
        ld1             {v25.d}[1], [x0], x1
        ld1             {v18.d}[1], [x16], x1
        ld1             {v26.d}[1], [x0], x1
        ld1             {v19.d}[1], [x16], x1
        ld1             {v27.d}[1], [x0], x1
        ld1             {v20.d}[1], [x16], x1
        ld1             {v28.d}[1], [x0], x1
        ld1             {v21.d}[1], [x16], x1
        ld1             {v29.d}[1], [x0], x1
        ld1             {v22.d}[1], [x16], x1
        ld1             {v30.d}[1], [x0], x1
        ld1             {v23.d}[1], [x16], x1
        ld1             {v31.d}[1], [x0], x1

        transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        lpf_16_wd16

        sub             x0, x0, x1, lsl #4
        sub             x16, x0, #8

        transpose_8x16b v16, v17, v0, v1, v2, v3, v4, v5, v18, v19
        transpose_8x16b v6, v7, v8, v9, v10, v11, v30, v31, v18, v19

        st1             {v16.d}[0], [x16], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v17.d}[0], [x16], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[0], [x16], x1
        st1             {v8.d}[0], [x0], x1
        st1             {v1.d}[0], [x16], x1
        st1             {v9.d}[0], [x0], x1
        st1             {v2.d}[0], [x16], x1
        st1             {v10.d}[0], [x0], x1
        st1             {v3.d}[0], [x16], x1
        st1             {v11.d}[0], [x0], x1
        st1             {v4.d}[0], [x16], x1
        st1             {v30.d}[0], [x0], x1
        st1             {v5.d}[0], [x16], x1
        st1             {v31.d}[0], [x0], x1
        st1             {v16.d}[1], [x16], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v17.d}[1], [x16], x1
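        st1             {v7.d}[1], [x0], x1
        st1             {v0.d}[1], [x16], x1
        st1             {v8.d}[1], [x0], x1
        st1             {v1.d}[1], [x16], x1
        st1             {v9.d}[1], [x0], x1
        st1             {v2.d}[1], [x16], x1
        st1             {v10.d}[1], [x0], x1
        st1             {v3.d}[1], [x16], x1
        st1             {v11.d}[1], [x0], x1
        st1             {v4.d}[1], [x16], x1
        st1             {v30.d}[1], [x0], x1
        st1             {v5.d}[1], [x16], x1
        st1             {v31.d}[1], [x0], x1
        ret             x15

7:
        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #4
        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v20.d}[0], [x16], x1
        st1             {v20.d}[1], [x0], x1
        st1             {v21.d}[0], [x16], x1
        st1             {v21.d}[1], [x0], x1
        st1             {v22.d}[0], [x16], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x16], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x16], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x16], x1
        st1             {v25.d}[1], [x0], x1
        st1             {v26.d}[0], [x16], x1
        st1             {v26.d}[1], [x0], x1
        st1             {v27.d}[0], [x16], x1
        st1             {v27.d}[1], [x0], x1
        add             x0, x0, #4
        ret             x15
8:
        sub             x16, x0, x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0, x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0], x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0], x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0], x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0], x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0], x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0], x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0], x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0], x1
        add             x0, x0, #2
        ret             x15
endfunc

// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                 const uint32_t *const vmask,
//                                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
//                                 const Av1FilterLUT *lut, const int w)
//
// A rough C sketch of the per-edge driver logic implemented by the macro
// below (illustrative only; iclip() is dav1d's clamping helper):
//
//   for (each group of 4 blocks, i.e. 16 pixels, along the edge) {
//       if (!(vmask[0] & bits)) continue;              // nothing to filter
//       L = l[0][0] ? l[0][0] : l[offset][0];          // filter level
//       if (!L) continue;
//       I = iclip(L >> lut->sharp[0], 1, lut->sharp[1]);
//       E = 2 * (L + 2) + I;
//       H = L >> 4;
//       wd = (vmask[2] & bits) ? 16 : (vmask[1] & bits) ? 8 : 4; // 6, not 8, for uv
//       filter wd pixels across the edge using E, I and H;
//   }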
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
        mov             x11, x30
        stp             d8, d9, [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]
        ldp             w6, w7, [x2]                    // vmask[0], vmask[1]
.ifc \type, y
        ldr             w2, [x2, #8]                    // vmask[2]
.endif
        add             x5, x5, #128                    // Move to sharp part of lut
.ifc \type, y
        orr             w7, w7, w2                      // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
        sub             x4, x3, x4, lsl #2
.else
        sub             x3, x3, #4
        lsl             x4, x4, #2
.endif
        orr             w6, w6, w7                      // vmask[0] |= vmask[1]
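
        // Each iteration of the loop below handles 4 blocks (16 pixels)
        // along the edge: test the low 4 bits of the vmasks, derive the
        // E/I/H thresholds (v10/v11/v12) from the filter level L as in the
        // sketch above, and dispatch to the widest filter any of the 4
        // blocks needs; the per-lane masks v13-v15 narrow it down within
        // the group.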
1:
        tst             w6, #0x0f
.ifc \dir, v
        ld1             {v0.16b}, [x4], #16
        ld1             {v1.16b}, [x3], #16
.else
        ld2             {v0.s,v1.s}[0], [x3], x4
        ld2             {v0.s,v1.s}[1], [x3], x4
        ld2             {v0.s,v1.s}[2], [x3], x4
        ld2             {v0.s,v1.s}[3], [x3], x4
.endif
        b.eq            7f                              // if (!(vm & bits)) continue;

        ld1r            {v5.16b}, [x5]                  // sharp[0]
        add             x5, x5, #8
        movi            v2.4s, #0xff
        dup             v13.4s, w6                      // vmask[0]

        and             v0.16b, v0.16b, v2.16b          // Keep only lowest byte in each 32 bit word
        and             v1.16b, v1.16b, v2.16b
        cmtst           v3.16b, v1.16b, v2.16b          // Check for nonzero values in l[0][0]
        movi            v4.16b, #1
        ld1r            {v6.16b}, [x5]                  // sharp[1]
        sub             x5, x5, #8
        bif             v1.16b, v0.16b, v3.16b          // if (!l[0][0]) L = l[offset][0]
        cmtst           v2.4s, v1.4s, v2.4s             // L != 0
        mul             v1.4s, v1.4s, v4.4s             // L, copied to all 4 bytes of each word
.ifc \type, y
        dup             v15.4s, w2                      // vmask[2]
.endif
        dup             v14.4s, w7                      // vmask[1]
        mov             x16, v2.d[0]
        mov             x17, v2.d[1]
        adds            x16, x16, x17
        b.eq            7f                              // if (!L) continue;
        neg             v5.16b, v5.16b                  // -sharp[0]
        movrel          x16, word_1248
        ushr            v12.16b, v1.16b, #4             // H
        ld1             {v16.4s}, [x16]
        sshl            v3.16b, v1.16b, v5.16b          // L >> sharp[0]
.ifc \type, y
        cmtst           v15.4s, v15.4s, v16.4s          // if (vmask[2] & bits)
.endif
        movi            v7.16b, #2
        umin            v3.16b, v3.16b, v6.16b          // imin(L >> sharp[0], sharp[1])
        add             v0.16b, v1.16b, v7.16b          // L + 2
        umax            v11.16b, v3.16b, v4.16b         // imax(imin(), 1) = limit = I
        add             v0.16b, v0.16b, v0.16b          // 2*(L + 2)
        cmtst           v14.4s, v14.4s, v16.4s          // if (vmask[1] & bits)
        add             v10.16b, v0.16b, v11.16b        // 2*(L + 2) + limit = E
        cmtst           v13.4s, v13.4s, v16.4s          // if (vmask[0] & bits)
        and             v13.16b, v13.16b, v2.16b        // vmask[0] &= L != 0

.ifc \type, y
        tst             w2, #0x0f
        b.eq            2f
        // wd16
        bl              lpf_\dir\()_16_16_neon
        b               8f
2:
.endif
        tst             w7, #0x0f
        b.eq            3f
.ifc \type, y
        // wd8
        bl              lpf_\dir\()_8_16_neon
.else
        // wd6
        bl              lpf_\dir\()_6_16_neon
.endif
        b               8f
3:
        // wd4
        bl              lpf_\dir\()_4_16_neon
.ifc \dir, h
        b               8f
7:
        // For dir h, the functions above increment x0.
        // If the whole function is skipped, increment it here instead.
        add             x0, x0, x1, lsl #4
.else
7:
.endif
8:
        lsr             w6, w6, #4                      // vmask[0] >>= 4
        lsr             w7, w7, #4                      // vmask[1] >>= 4
.ifc \type, y
        lsr             w2, w2, #4                      // vmask[2] >>= 4
.endif
.ifc \dir, v
        add             x0, x0, #16
.else
        // For dir h, x0 is returned incremented
.endif
        cbnz            w6, 1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8, d9, [sp], 0x40
        ret             x11
endfunc
.endm

lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv

const word_1248
        .word 1, 2, 4, 8
endconst