/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

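// Register layout used by the filter functions generated by the loop_filter
// macro below (lpf_16_wd4/6/8/16_neon), which filter one 16-pixel-wide
// stretch of an edge entirely in registers:
//   v17-v23      p6 .. p0 (only the pixels needed for the given \wd are loaded)
//   v24-v30      q0 .. q6
//   v10/v11/v12  the E, I and H thresholds
//   v13/v14/v15  lane masks for "wd >= 4", "wd > 4" and "wd == 16"
// The functions return normally when the full-width filter was applied, or
// via the return addresses in x13/x14/x15 when only the inner 6 pixels,
// only the inner 4 pixels, or nothing at all needs to be written back.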
.macro loop_filter wd
function lpf_16_wd\wd\()_neon
        uabd            v0.16b,  v22.16b, v23.16b // abs(p1 - p0)
        uabd            v1.16b,  v25.16b, v24.16b // abs(q1 - q0)
        uabd            v2.16b,  v23.16b, v24.16b // abs(p0 - q0)
        uabd            v3.16b,  v22.16b, v25.16b // abs(p1 - q1)
.if \wd >= 6
        uabd            v4.16b,  v21.16b, v22.16b // abs(p2 - p1)
        uabd            v5.16b,  v26.16b, v25.16b // abs(q2 - q1)
.endif
.if \wd >= 8
        uabd            v6.16b,  v20.16b, v21.16b // abs(p3 - p2)
        uabd            v7.16b,  v27.16b, v26.16b // abs(q3 - q2)
.endif
.if \wd >= 6
        umax            v4.16b,  v4.16b,  v5.16b
.endif
        uqadd           v2.16b,  v2.16b,  v2.16b  // abs(p0 - q0) * 2
.if \wd >= 8
        umax            v6.16b,  v6.16b,  v7.16b
.endif
        ushr            v3.16b,  v3.16b,  #1
.if \wd >= 8
        umax            v4.16b,  v4.16b,  v6.16b
.endif
.if \wd >= 6
        and             v4.16b,  v4.16b,  v14.16b
.endif
        umax            v0.16b,  v0.16b,  v1.16b  // max(abs(p1 - p0), abs(q1 - q0))
        uqadd           v2.16b,  v2.16b,  v3.16b  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
.if \wd >= 6
        umax            v4.16b,  v0.16b,  v4.16b
        cmhs            v1.16b,  v11.16b, v4.16b  // max(abs(p1 - p0), abs(q1 - q0), abs(), abs(), ...) <= I
.else
        cmhs            v1.16b,  v11.16b, v0.16b  // max(abs(p1 - p0), abs(q1 - q0)) <= I
.endif
        cmhs            v2.16b,  v10.16b, v2.16b  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1 <= E
        and             v1.16b,  v1.16b,  v2.16b  // fm
        and             v1.16b,  v1.16b,  v13.16b // fm && wd >= 4
.if \wd >= 6
        and             v14.16b, v14.16b, v1.16b  // fm && wd > 4
.endif
.if \wd >= 16
        and             v15.16b, v15.16b, v1.16b  // fm && wd == 16
.endif

        mov             x16, v1.d[0]
        mov             x17, v1.d[1]
        adds            x16, x16, x17
        b.eq            9f                        // if (!fm || wd < 4) return;

.if \wd >= 6
        movi            v10.16b, #1
        uabd            v2.16b,  v21.16b, v23.16b // abs(p2 - p0)
        uabd            v3.16b,  v22.16b, v23.16b // abs(p1 - p0)
        uabd            v4.16b,  v25.16b, v24.16b // abs(q1 - q0)
        uabd            v5.16b,  v26.16b, v24.16b // abs(q2 - q0)
.if \wd >= 8
        uabd            v6.16b,  v20.16b, v23.16b // abs(p3 - p0)
        uabd            v7.16b,  v27.16b, v24.16b // abs(q3 - q0)
.endif
        umax            v2.16b,  v2.16b,  v3.16b
        umax            v4.16b,  v4.16b,  v5.16b
.if \wd >= 8
        umax            v6.16b,  v6.16b,  v7.16b
.endif
        umax            v2.16b,  v2.16b,  v4.16b
.if \wd >= 8
        umax            v2.16b,  v2.16b,  v6.16b
.endif

.if \wd == 16
        uabd            v3.16b,  v17.16b, v23.16b // abs(p6 - p0)
        uabd            v4.16b,  v18.16b, v23.16b // abs(p5 - p0)
        uabd            v5.16b,  v19.16b, v23.16b // abs(p4 - p0)
.endif
        cmhs            v2.16b,  v10.16b, v2.16b  // flat8in
.if \wd == 16
        uabd            v6.16b,  v28.16b, v24.16b // abs(q4 - q0)
        uabd            v7.16b,  v29.16b, v24.16b // abs(q5 - q0)
        uabd            v8.16b,  v30.16b, v24.16b // abs(q6 - q0)
.endif
        and             v14.16b, v2.16b,  v14.16b // flat8in && fm && wd > 4
        bic             v1.16b,  v1.16b,  v14.16b // fm && wd >= 4 && !flat8in
.if \wd == 16
        umax            v3.16b,  v3.16b,  v4.16b
        umax            v5.16b,  v5.16b,  v6.16b
.endif
        mov             x16, v1.d[0]
        mov             x17, v1.d[1]
.if \wd == 16
        umax            v7.16b,  v7.16b,  v8.16b
        umax            v3.16b,  v3.16b,  v5.16b
        umax            v3.16b,  v3.16b,  v7.16b
        cmhs            v3.16b,  v10.16b, v3.16b  // flat8out
.endif
        adds            x16, x16, x17
.if \wd == 16
        and             v15.16b, v15.16b, v3.16b  // flat8out && fm && wd == 16
        and             v15.16b, v15.16b, v14.16b // flat8out && flat8in && fm && wd == 16
        bic             v14.16b, v14.16b, v15.16b // flat8in && fm && wd >= 4 && !flat8out
.endif
        b.eq            1f                        // skip wd == 4 case
.endif
        movi            v3.16b,  #128
        eor             v2.16b,  v22.16b, v3.16b  // p1 - 128
        eor             v3.16b,  v25.16b, v3.16b  // q1 - 128
        cmhi            v0.16b,  v0.16b,  v12.16b // hev
        sqsub           v2.16b,  v2.16b,  v3.16b  // iclip_diff(p1 - q1)
        and             v4.16b,  v2.16b,  v0.16b  // if (hev) iclip_diff(p1 - q1)
        bic             v0.16b,  v1.16b,  v0.16b  // (fm && wd >= 4 && !hev)
        usubl           v2.8h,   v24.8b,  v23.8b
        movi            v5.8h,   #3
        usubl2          v3.8h,   v24.16b, v23.16b
        mul             v2.8h,   v2.8h,   v5.8h
        mul             v3.8h,   v3.8h,   v5.8h
        movi            v6.16b,  #4
        saddw           v2.8h,   v2.8h,   v4.8b
        saddw2          v3.8h,   v3.8h,   v4.16b
        movi            v7.16b,  #3
        sqxtn           v2.8b,   v2.8h            // f
        sqxtn2          v2.16b,  v3.8h
        sqadd           v4.16b,  v6.16b,  v2.16b  // imin(f + 4, 127)
        sqadd           v5.16b,  v7.16b,  v2.16b  // imin(f + 3, 127)
        sshr            v4.16b,  v4.16b,  #3      // f1
        sshr            v5.16b,  v5.16b,  #3      // f2
        mov             v2.16b,  v23.16b          // p0
        mov             v3.16b,  v24.16b          // q0
        neg             v6.16b,  v4.16b           // -f1
        srshr           v4.16b,  v4.16b,  #1      // (f1 + 1) >> 1
        // p0 + f2, q0 - f1
        usqadd          v2.16b,  v5.16b           // out p0
        usqadd          v3.16b,  v6.16b           // out q0
        neg             v6.16b,  v4.16b           // -((f1 + 1) >> 1)
        bit             v23.16b, v2.16b,  v1.16b  // if (fm && wd >= 4)
        bit             v24.16b, v3.16b,  v1.16b  // if (fm && wd >= 4)
        mov             v2.16b,  v22.16b          // p1
        mov             v3.16b,  v25.16b          // q1
        // p1 + ((f1 + 1) >> 1), q1 - ((f1 + 1) >> 1)
        usqadd          v2.16b,  v4.16b           // out p1
        usqadd          v3.16b,  v6.16b           // out q1
        bit             v22.16b, v2.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
        bit             v25.16b, v3.16b,  v0.16b  // if (fm && wd >= 4 && !hev)
1:

.if \wd == 6
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
        b.eq            2f                        // skip if there's no flat8in

        uaddl           v0.8h,   v21.8b,  v21.8b  // p2 * 2
        uaddl2          v1.8h,   v21.16b, v21.16b
        uaddl           v2.8h,   v21.8b,  v22.8b  // p2 + p1
        uaddl2          v3.8h,   v21.16b, v22.16b
        uaddl           v4.8h,   v22.8b,  v23.8b  // p1 + p0
        uaddl2          v5.8h,   v22.16b, v23.16b
        uaddl           v6.8h,   v23.8b,  v24.8b  // p0 + q0
        uaddl2          v7.8h,   v23.16b, v24.16b
        add             v8.8h,   v0.8h,   v2.8h
        add             v9.8h,   v1.8h,   v3.8h
        add             v10.8h,  v4.8h,   v6.8h
        add             v11.8h,  v5.8h,   v7.8h
        uaddl           v12.8h,  v24.8b,  v25.8b  // q0 + q1
        uaddl2          v13.8h,  v24.16b, v25.16b
        add             v8.8h,   v8.8h,   v10.8h
        add             v9.8h,   v9.8h,   v11.8h
        sub             v12.8h,  v12.8h,  v0.8h
        sub             v13.8h,  v13.8h,  v1.8h
        uaddl           v10.8h,  v25.8b,  v26.8b  // q1 + q2
        uaddl2          v11.8h,  v25.16b, v26.16b
        rshrn           v0.8b,   v8.8h,   #3      // out p1
        rshrn2          v0.16b,  v9.8h,   #3

        add             v8.8h,   v8.8h,   v12.8h
        add             v9.8h,   v9.8h,   v13.8h
        sub             v10.8h,  v10.8h,  v2.8h
        sub             v11.8h,  v11.8h,  v3.8h
        uaddl           v12.8h,  v26.8b,  v26.8b  // q2 + q2
        uaddl2          v13.8h,  v26.16b, v26.16b
        rshrn           v1.8b,   v8.8h,   #3      // out p0
        rshrn2          v1.16b,  v9.8h,   #3

        add             v8.8h,   v8.8h,   v10.8h
        add             v9.8h,   v9.8h,   v11.8h
        sub             v12.8h,  v12.8h,  v4.8h
        sub             v13.8h,  v13.8h,  v5.8h
        rshrn           v2.8b,   v8.8h,   #3      // out q0
        rshrn2          v2.16b,  v9.8h,   #3

        bit             v22.16b, v0.16b,  v14.16b // p1 if (flat8in)
        add             v8.8h,   v8.8h,   v12.8h
        add             v9.8h,   v9.8h,   v13.8h
        bit             v23.16b, v1.16b,  v14.16b // p0 if (flat8in)
        rshrn           v3.8b,   v8.8h,   #3      // out q1
        rshrn2          v3.16b,  v9.8h,   #3
        bit             v24.16b, v2.16b,  v14.16b // q0 if (flat8in)
        bit             v25.16b, v3.16b,  v14.16b // q1 if (flat8in)
.elseif \wd >= 8
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
.if \wd == 8
        b.eq            8f                        // skip if there's no flat8in
.else
        b.eq            2f                        // skip if there's no flat8in
.endif

        uaddl           v0.8h,   v20.8b,  v21.8b  // p3 + p2
        uaddl2          v1.8h,   v20.16b, v21.16b
        uaddl           v2.8h,   v22.8b,  v25.8b  // p1 + q1
        uaddl2          v3.8h,   v22.16b, v25.16b
        uaddl           v4.8h,   v20.8b,  v22.8b  // p3 + p1
        uaddl2          v5.8h,   v20.16b, v22.16b
        uaddl           v6.8h,   v23.8b,  v26.8b  // p0 + q2
        uaddl2          v7.8h,   v23.16b, v26.16b
        add             v8.8h,   v0.8h,   v0.8h   // 2 * (p3 + p2)
        add             v9.8h,   v1.8h,   v1.8h
        uaddw           v8.8h,   v8.8h,   v23.8b  // + p0
        uaddw2          v9.8h,   v9.8h,   v23.16b
        uaddw           v8.8h,   v8.8h,   v24.8b  // + q0
        uaddw2          v9.8h,   v9.8h,   v24.16b
        add             v8.8h,   v8.8h,   v4.8h
        add             v9.8h,   v9.8h,   v5.8h   // + p3 + p1
        sub             v2.8h,   v2.8h,   v0.8h   // p1 + q1 - p3 - p2
        sub             v3.8h,   v3.8h,   v1.8h
        sub             v6.8h,   v6.8h,   v4.8h   // p0 + q2 - p3 - p1
        sub             v7.8h,   v7.8h,   v5.8h
        rshrn           v10.8b,  v8.8h,   #3      // out p2
        rshrn2          v10.16b, v9.8h,   #3

        add             v8.8h,   v8.8h,   v2.8h
        add             v9.8h,   v9.8h,   v3.8h
        uaddl           v0.8h,   v20.8b,  v23.8b  // p3 + p0
        uaddl2          v1.8h,   v20.16b, v23.16b
        uaddl           v2.8h,   v24.8b,  v27.8b  // q0 + q3
        uaddl2          v3.8h,   v24.16b, v27.16b
        rshrn           v11.8b,  v8.8h,   #3      // out p1
        rshrn2          v11.16b, v9.8h,   #3

        add             v8.8h,   v8.8h,   v6.8h
        add             v9.8h,   v9.8h,   v7.8h
        sub             v2.8h,   v2.8h,   v0.8h   // q0 + q3 - p3 - p0
        sub             v3.8h,   v3.8h,   v1.8h
        uaddl           v4.8h,   v21.8b,  v24.8b  // p2 + q0
        uaddl2          v5.8h,   v21.16b, v24.16b
        uaddl           v6.8h,   v25.8b,  v27.8b  // q1 + q3
        uaddl2          v7.8h,   v25.16b, v27.16b
        rshrn           v12.8b,  v8.8h,   #3      // out p0
        rshrn2          v12.16b, v9.8h,   #3

        add             v8.8h,   v8.8h,   v2.8h
        add             v9.8h,   v9.8h,   v3.8h
        sub             v6.8h,   v6.8h,   v4.8h   // q1 + q3 - p2 - q0
        sub             v7.8h,   v7.8h,   v5.8h
        uaddl           v0.8h,   v22.8b,  v25.8b  // p1 + q1
        uaddl2          v1.8h,   v22.16b, v25.16b
        uaddl           v2.8h,   v26.8b,  v27.8b  // q2 + q3
        uaddl2          v3.8h,   v26.16b, v27.16b
        rshrn           v13.8b,  v8.8h,   #3      // out q0
        rshrn2          v13.16b, v9.8h,   #3

        add             v8.8h,   v8.8h,   v6.8h
        add             v9.8h,   v9.8h,   v7.8h
        sub             v2.8h,   v2.8h,   v0.8h   // q2 + q3 - p1 - q1
        sub             v3.8h,   v3.8h,   v1.8h
        rshrn           v0.8b,   v8.8h,   #3      // out q1
        rshrn2          v0.16b,  v9.8h,   #3

        add             v8.8h,   v8.8h,   v2.8h
        add             v9.8h,   v9.8h,   v3.8h

        bit             v21.16b, v10.16b, v14.16b
        bit             v22.16b, v11.16b, v14.16b
        bit             v23.16b, v12.16b, v14.16b
        rshrn           v1.8b,   v8.8h,   #3      // out q2
        rshrn2          v1.16b,  v9.8h,   #3
        bit             v24.16b, v13.16b, v14.16b
        bit             v25.16b, v0.16b,  v14.16b
        bit             v26.16b, v1.16b,  v14.16b
.endif
2:
.if \wd == 16
        mov             x16, v15.d[0]
        mov             x17, v15.d[1]
        adds            x16, x16, x17
        b.ne            1f                        // check if flat8out is needed
        mov             x16, v14.d[0]
        mov             x17, v14.d[1]
        adds            x16, x16, x17
        b.eq            8f                        // if there was no flat8in, just write the inner 4 pixels
        b               7f                        // if flat8in was used, write the inner 6 pixels
1:

        uaddl           v2.8h,   v17.8b,  v17.8b  // p6 + p6
        uaddl2          v3.8h,   v17.16b, v17.16b
        uaddl           v4.8h,   v17.8b,  v18.8b  // p6 + p5
        uaddl2          v5.8h,   v17.16b, v18.16b
        uaddl           v6.8h,   v17.8b,  v19.8b  // p6 + p4
        uaddl2          v7.8h,   v17.16b, v19.16b
        uaddl           v8.8h,   v17.8b,  v20.8b  // p6 + p3
        uaddl2          v9.8h,   v17.16b, v20.16b
        add             v12.8h,  v2.8h,   v4.8h
        add             v13.8h,  v3.8h,   v5.8h
        add             v10.8h,  v6.8h,   v8.8h
        add             v11.8h,  v7.8h,   v9.8h
        uaddl           v6.8h,   v17.8b,  v21.8b  // p6 + p2
        uaddl2          v7.8h,   v17.16b, v21.16b
        add             v12.8h,  v12.8h,  v10.8h
        add             v13.8h,  v13.8h,  v11.8h
        uaddl           v8.8h,   v17.8b,  v22.8b  // p6 + p1
        uaddl2          v9.8h,   v17.16b, v22.16b
        uaddl           v10.8h,  v18.8b,  v23.8b  // p5 + p0
        uaddl2          v11.8h,  v18.16b, v23.16b
        add             v6.8h,   v6.8h,   v8.8h
        add             v7.8h,   v7.8h,   v9.8h
        uaddl           v8.8h,   v19.8b,  v24.8b  // p4 + q0
        uaddl2          v9.8h,   v19.16b, v24.16b
        add             v12.8h,  v12.8h,  v6.8h
        add             v13.8h,  v13.8h,  v7.8h
        add             v10.8h,  v10.8h,  v8.8h
        add             v11.8h,  v11.8h,  v9.8h
        uaddl           v6.8h,   v20.8b,  v25.8b  // p3 + q1
        uaddl2          v7.8h,   v20.16b, v25.16b
        add             v12.8h,  v12.8h,  v10.8h
        add             v13.8h,  v13.8h,  v11.8h
        sub             v6.8h,   v6.8h,   v2.8h
        sub             v7.8h,   v7.8h,   v3.8h
        uaddl           v2.8h,   v21.8b,  v26.8b  // p2 + q2
        uaddl2          v3.8h,   v21.16b, v26.16b
        rshrn           v0.8b,   v12.8h,  #4      // out p5
        rshrn2          v0.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p6) + (p3 + q1)
        add             v13.8h,  v13.8h,  v7.8h
        sub             v2.8h,   v2.8h,   v4.8h
        sub             v3.8h,   v3.8h,   v5.8h
        uaddl           v4.8h,   v22.8b,  v27.8b  // p1 + q3
        uaddl2          v5.8h,   v22.16b, v27.16b
        uaddl           v6.8h,   v17.8b,  v19.8b  // p6 + p4
        uaddl2          v7.8h,   v17.16b, v19.16b
        rshrn           v1.8b,   v12.8h,  #4      // out p4
        rshrn2          v1.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v2.8h   // - (p6 + p5) + (p2 + q2)
        add             v13.8h,  v13.8h,  v3.8h
        sub             v4.8h,   v4.8h,   v6.8h
        sub             v5.8h,   v5.8h,   v7.8h
        uaddl           v6.8h,   v23.8b,  v28.8b  // p0 + q4
        uaddl2          v7.8h,   v23.16b, v28.16b
        uaddl           v8.8h,   v17.8b,  v20.8b  // p6 + p3
        uaddl2          v9.8h,   v17.16b, v20.16b
        rshrn           v2.8b,   v12.8h,  #4      // out p3
        rshrn2          v2.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v4.8h   // - (p6 + p4) + (p1 + q3)
        add             v13.8h,  v13.8h,  v5.8h
        sub             v6.8h,   v6.8h,   v8.8h
        sub             v7.8h,   v7.8h,   v9.8h
        uaddl           v8.8h,   v24.8b,  v29.8b  // q0 + q5
        uaddl2          v9.8h,   v24.16b, v29.16b
        uaddl           v4.8h,   v17.8b,  v21.8b  // p6 + p2
        uaddl2          v5.8h,   v17.16b, v21.16b
        rshrn           v3.8b,   v12.8h,  #4      // out p2
        rshrn2          v3.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p3) + (p0 + q4)
        add             v13.8h,  v13.8h,  v7.8h
        sub             v8.8h,   v8.8h,   v4.8h
        sub             v9.8h,   v9.8h,   v5.8h
        uaddl           v6.8h,   v25.8b,  v30.8b  // q1 + q6
        uaddl2          v7.8h,   v25.16b, v30.16b
        uaddl           v10.8h,  v17.8b,  v22.8b  // p6 + p1
        uaddl2          v11.8h,  v17.16b, v22.16b
        rshrn           v4.8b,   v12.8h,  #4      // out p1
        rshrn2          v4.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v8.8h   // - (p6 + p2) + (q0 + q5)
        add             v13.8h,  v13.8h,  v9.8h
        sub             v6.8h,   v6.8h,   v10.8h
        sub             v7.8h,   v7.8h,   v11.8h
        uaddl           v8.8h,   v26.8b,  v30.8b  // q2 + q6
        uaddl2          v9.8h,   v26.16b, v30.16b
        bif             v0.16b,  v18.16b, v15.16b // out p5
        uaddl           v10.8h,  v18.8b,  v23.8b  // p5 + p0
        uaddl2          v11.8h,  v18.16b, v23.16b
        rshrn           v5.8b,   v12.8h,  #4      // out p0
        rshrn2          v5.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v6.8h   // - (p6 + p1) + (q1 + q6)
        add             v13.8h,  v13.8h,  v7.8h
        sub             v8.8h,   v8.8h,   v10.8h
        sub             v9.8h,   v9.8h,   v11.8h
        uaddl           v10.8h,  v27.8b,  v30.8b  // q3 + q6
        uaddl2          v11.8h,  v27.16b, v30.16b
        bif             v1.16b,  v19.16b, v15.16b // out p4
        uaddl           v18.8h,  v19.8b,  v24.8b  // p4 + q0
        uaddl2          v19.8h,  v19.16b, v24.16b
        rshrn           v6.8b,   v12.8h,  #4      // out q0
        rshrn2          v6.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v8.8h   // - (p5 + p0) + (q2 + q6)
        add             v13.8h,  v13.8h,  v9.8h
        sub             v10.8h,  v10.8h,  v18.8h
        sub             v11.8h,  v11.8h,  v19.8h
        uaddl           v8.8h,   v28.8b,  v30.8b  // q4 + q6
        uaddl2          v9.8h,   v28.16b, v30.16b
        bif             v2.16b,  v20.16b, v15.16b // out p3
        uaddl           v18.8h,  v20.8b,  v25.8b  // p3 + q1
        uaddl2          v19.8h,  v20.16b, v25.16b
        rshrn           v7.8b,   v12.8h,  #4      // out q1
        rshrn2          v7.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v10.8h  // - (p4 + q0) + (q3 + q6)
        add             v13.8h,  v13.8h,  v11.8h
        sub             v18.8h,  v8.8h,   v18.8h
        sub             v19.8h,  v9.8h,   v19.8h
        uaddl           v10.8h,  v29.8b,  v30.8b  // q5 + q6
        uaddl2          v11.8h,  v29.16b, v30.16b
        bif             v3.16b,  v21.16b, v15.16b // out p2
        uaddl           v20.8h,  v21.8b,  v26.8b  // p2 + q2
        uaddl2          v21.8h,  v21.16b, v26.16b
        rshrn           v8.8b,   v12.8h,  #4      // out q2
        rshrn2          v8.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v18.8h  // - (p3 + q1) + (q4 + q6)
        add             v13.8h,  v13.8h,  v19.8h
        sub             v10.8h,  v10.8h,  v20.8h
        sub             v11.8h,  v11.8h,  v21.8h
        uaddl           v18.8h,  v30.8b,  v30.8b  // q6 + q6
        uaddl2          v19.8h,  v30.16b, v30.16b
        bif             v4.16b,  v22.16b, v15.16b // out p1
        uaddl           v20.8h,  v22.8b,  v27.8b  // p1 + q3
        uaddl2          v21.8h,  v22.16b, v27.16b
        rshrn           v9.8b,   v12.8h,  #4      // out q3
        rshrn2          v9.16b,  v13.8h,  #4
        add             v12.8h,  v12.8h,  v10.8h  // - (p2 + q2) + (q5 + q6)
        add             v13.8h,  v13.8h,  v11.8h
        sub             v18.8h,  v18.8h,  v20.8h
        sub             v19.8h,  v19.8h,  v21.8h
        bif             v5.16b,  v23.16b, v15.16b // out p0
        rshrn           v10.8b,  v12.8h,  #4      // out q4
        rshrn2          v10.16b, v13.8h,  #4
        add             v12.8h,  v12.8h,  v18.8h  // - (p1 + q3) + (q6 + q6)
        add             v13.8h,  v13.8h,  v19.8h
        rshrn           v11.8b,  v12.8h,  #4      // out q5
        rshrn2          v11.16b, v13.8h,  #4
        bif             v6.16b,  v24.16b, v15.16b // out q0
        bif             v7.16b,  v25.16b, v15.16b // out q1
        bif             v8.16b,  v26.16b, v15.16b // out q2
        bif             v9.16b,  v27.16b, v15.16b // out q3
        bif             v10.16b, v28.16b, v15.16b // out q4
        bif             v11.16b, v29.16b, v15.16b // out q5
.endif

        ret
.if \wd == 16
7:
        // Return to a shorter epilogue, writing only the inner 6 pixels
        ret             x13
.endif
.if \wd >= 8
8:
        // Return to a shorter epilogue, writing only the inner 4 pixels
        ret             x14
.endif
9:
        // Return directly without writing back any pixels
        ret             x15
endfunc
.endm

loop_filter 16
loop_filter 8
loop_filter 6
loop_filter 4

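// Wrappers around the shared filter functions. For the wider filters they
// preload x13/x14 with the addresses of the local labels 7: and 8: in the
// calling function, so the filter can return straight into the caller's
// shorter store epilogues when only the inner 6 or inner 4 pixels were
// actually filtered.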
.macro lpf_16_wd16
        adr             x13, 7f
        adr             x14, 8f
        bl              lpf_16_wd16_neon
.endm

.macro lpf_16_wd8
        adr             x14, 8f
        bl              lpf_16_wd8_neon
.endm

.macro lpf_16_wd6
        bl              lpf_16_wd6_neon
.endm

.macro lpf_16_wd4
        bl              lpf_16_wd4_neon
.endm

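// Per-direction edge helpers, each filtering 16 pixels along the edge per
// call. The lpf_v_* variants load and store whole rows above/below dst (x0)
// using the stride in x1; the lpf_h_* variants gather the columns around
// dst, transpose them into the same p*/q* register layout, and transpose
// back before storing. x15 holds the caller's return address throughout.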
function lpf_v_4_16_neon
        mov             x15, x30
        sub             x16, x0,  x1, lsl #1
        ld1             {v22.16b}, [x16], x1 // p1
        ld1             {v24.16b}, [x0],  x1 // q0
        ld1             {v23.16b}, [x16], x1 // p0
        ld1             {v25.16b}, [x0],  x1 // q1
        sub             x0,  x0,  x1, lsl #1

        lpf_16_wd4

        sub             x16, x0,  x1, lsl #1
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v25.16b}, [x0],  x1 // q1
        sub             x0,  x0,  x1, lsl #1
        ret             x15
endfunc

function lpf_h_4_16_neon
        mov             x15, x30
        sub             x16, x0,  #2
        add             x0,  x16, x1, lsl #3
        ld1             {v22.s}[0], [x16], x1
        ld1             {v22.s}[2], [x0],  x1
        ld1             {v23.s}[0], [x16], x1
        ld1             {v23.s}[2], [x0],  x1
        ld1             {v24.s}[0], [x16], x1
        ld1             {v24.s}[2], [x0],  x1
        ld1             {v25.s}[0], [x16], x1
        ld1             {v25.s}[2], [x0],  x1
        ld1             {v22.s}[1], [x16], x1
        ld1             {v22.s}[3], [x0],  x1
        ld1             {v23.s}[1], [x16], x1
        ld1             {v23.s}[3], [x0],  x1
        ld1             {v24.s}[1], [x16], x1
        ld1             {v24.s}[3], [x0],  x1
        ld1             {v25.s}[1], [x16], x1
        ld1             {v25.s}[3], [x0],  x1
        add             x0,  x0,  #2

        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd4

        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0],  x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0],  x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0],  x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0],  x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0],  x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0],  x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0],  x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0],  x1
        add             x0,  x0,  #2
        ret             x15
endfunc

function lpf_v_6_16_neon
        mov             x15, x30
        sub             x16, x0,  x1, lsl #1
        sub             x16, x16, x1
        ld1             {v21.16b}, [x16], x1 // p2
        ld1             {v24.16b}, [x0],  x1 // q0
        ld1             {v22.16b}, [x16], x1 // p1
        ld1             {v25.16b}, [x0],  x1 // q1
        ld1             {v23.16b}, [x16], x1 // p0
        ld1             {v26.16b}, [x0],  x1 // q2
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        lpf_16_wd6

        sub             x16, x0,  x1, lsl #1
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v25.16b}, [x0],  x1 // q1
        sub             x0,  x0,  x1, lsl #1
        ret             x15
endfunc

function lpf_h_6_16_neon
        mov             x15, x30
        sub             x16, x0,  #4
        add             x0,  x16, x1, lsl #3
        ld1             {v20.d}[0], [x16], x1
        ld1             {v20.d}[1], [x0],  x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v21.d}[1], [x0],  x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v22.d}[1], [x0],  x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v23.d}[1], [x0],  x1
        ld1             {v24.d}[0], [x16], x1
        ld1             {v24.d}[1], [x0],  x1
        ld1             {v25.d}[0], [x16], x1
        ld1             {v25.d}[1], [x0],  x1
        ld1             {v26.d}[0], [x16], x1
        ld1             {v26.d}[1], [x0],  x1
        ld1             {v27.d}[0], [x16], x1
        ld1             {v27.d}[1], [x0],  x1
        add             x0,  x0,  #4

        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd6

        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0],  x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0],  x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0],  x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0],  x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0],  x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0],  x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0],  x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0],  x1
        add             x0,  x0,  #2
        ret             x15
endfunc

function lpf_v_8_16_neon
        mov             x15, x30
        sub             x16, x0,  x1, lsl #2
        ld1             {v20.16b}, [x16], x1 // p3
        ld1             {v24.16b}, [x0],  x1 // q0
        ld1             {v21.16b}, [x16], x1 // p2
        ld1             {v25.16b}, [x0],  x1 // q1
        ld1             {v22.16b}, [x16], x1 // p1
        ld1             {v26.16b}, [x0],  x1 // q2
        ld1             {v23.16b}, [x16], x1 // p0
        ld1             {v27.16b}, [x0],  x1 // q3
        sub             x0,  x0,  x1, lsl #2

        lpf_16_wd8

        sub             x16, x0,  x1, lsl #1
        sub             x16, x16, x1
        st1             {v21.16b}, [x16], x1 // p2
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v25.16b}, [x0],  x1 // q1
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v26.16b}, [x0],  x1 // q2
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        ret             x15

8:
        sub             x16, x0,  x1, lsl #1
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v25.16b}, [x0],  x1 // q1
        sub             x0,  x0,  x1, lsl #1
        ret             x15
endfunc

function lpf_h_8_16_neon
        mov             x15, x30
        sub             x16, x0,  #4
        add             x0,  x16, x1, lsl #3
        ld1             {v20.d}[0], [x16], x1
        ld1             {v20.d}[1], [x0],  x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v21.d}[1], [x0],  x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v22.d}[1], [x0],  x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v23.d}[1], [x0],  x1
        ld1             {v24.d}[0], [x16], x1
        ld1             {v24.d}[1], [x0],  x1
        ld1             {v25.d}[0], [x16], x1
        ld1             {v25.d}[1], [x0],  x1
        ld1             {v26.d}[0], [x16], x1
        ld1             {v26.d}[1], [x0],  x1
        ld1             {v27.d}[0], [x16], x1
        ld1             {v27.d}[1], [x0],  x1
        add             x0,  x0,  #4

        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        lpf_16_wd8

        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #4
        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v20.d}[0], [x16], x1
        st1             {v20.d}[1], [x0],  x1
        st1             {v21.d}[0], [x16], x1
        st1             {v21.d}[1], [x0],  x1
        st1             {v22.d}[0], [x16], x1
        st1             {v22.d}[1], [x0],  x1
        st1             {v23.d}[0], [x16], x1
        st1             {v23.d}[1], [x0],  x1
        st1             {v24.d}[0], [x16], x1
        st1             {v24.d}[1], [x0],  x1
        st1             {v25.d}[0], [x16], x1
        st1             {v25.d}[1], [x0],  x1
        st1             {v26.d}[0], [x16], x1
        st1             {v26.d}[1], [x0],  x1
        st1             {v27.d}[0], [x16], x1
        st1             {v27.d}[1], [x0],  x1
        add             x0,  x0,  #4
        ret             x15
8:
        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0],  x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0],  x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0],  x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0],  x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0],  x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0],  x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0],  x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0],  x1
        add             x0,  x0,  #2
        ret             x15
endfunc

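// The wd == 16 helpers have three store epilogues: the regular return path
// writes the twelve filtered pixels p5..q5 (left in v0-v11 by the wide
// filter), while labels 7: and 8: (reached through x13/x14) write back only
// the inner 6 or inner 4 pixels when the wide filter did not apply.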
function lpf_v_16_16_neon
        mov             x15, x30

        sub             x16, x0,  x1, lsl #3
        add             x16, x16, x1
        ld1             {v17.16b}, [x16], x1 // p6
        ld1             {v24.16b}, [x0],  x1 // q0
        ld1             {v18.16b}, [x16], x1 // p5
        ld1             {v25.16b}, [x0],  x1 // q1
        ld1             {v19.16b}, [x16], x1 // p4
        ld1             {v26.16b}, [x0],  x1 // q2
        ld1             {v20.16b}, [x16], x1 // p3
        ld1             {v27.16b}, [x0],  x1 // q3
        ld1             {v21.16b}, [x16], x1 // p2
        ld1             {v28.16b}, [x0],  x1 // q4
        ld1             {v22.16b}, [x16], x1 // p1
        ld1             {v29.16b}, [x0],  x1 // q5
        ld1             {v23.16b}, [x16], x1 // p0
        ld1             {v30.16b}, [x0],  x1 // q6
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        lpf_16_wd16

        sub             x16, x0,  x1, lsl #2
        sub             x16, x16, x1, lsl #1
        st1             {v0.16b},  [x16], x1 // p5
        st1             {v6.16b},  [x0],  x1 // q0
        st1             {v1.16b},  [x16], x1 // p4
        st1             {v7.16b},  [x0],  x1 // q1
        st1             {v2.16b},  [x16], x1 // p3
        st1             {v8.16b},  [x0],  x1 // q2
        st1             {v3.16b},  [x16], x1 // p2
        st1             {v9.16b},  [x0],  x1 // q3
        st1             {v4.16b},  [x16], x1 // p1
        st1             {v10.16b}, [x0],  x1 // q4
        st1             {v5.16b},  [x16], x1 // p0
        st1             {v11.16b}, [x0],  x1 // q5
        sub             x0,  x0,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #1
        ret             x15
7:
        sub             x16, x0,  x1
        sub             x16, x16, x1, lsl #1
        st1             {v21.16b}, [x16], x1 // p2
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v25.16b}, [x0],  x1 // q1
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v26.16b}, [x0],  x1 // q2
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        ret             x15

8:
        sub             x16, x0,  x1, lsl #1
        st1             {v22.16b}, [x16], x1 // p1
        st1             {v24.16b}, [x0],  x1 // q0
        st1             {v23.16b}, [x16], x1 // p0
        st1             {v25.16b}, [x0],  x1 // q1
        sub             x0,  x0,  x1, lsl #1
        ret             x15
endfunc

function lpf_h_16_16_neon
        mov             x15, x30
        sub             x16, x0,  #8
        ld1             {v16.d}[0], [x16], x1
        ld1             {v24.d}[0], [x0],  x1
        ld1             {v17.d}[0], [x16], x1
        ld1             {v25.d}[0], [x0],  x1
        ld1             {v18.d}[0], [x16], x1
        ld1             {v26.d}[0], [x0],  x1
        ld1             {v19.d}[0], [x16], x1
        ld1             {v27.d}[0], [x0],  x1
        ld1             {v20.d}[0], [x16], x1
        ld1             {v28.d}[0], [x0],  x1
        ld1             {v21.d}[0], [x16], x1
        ld1             {v29.d}[0], [x0],  x1
        ld1             {v22.d}[0], [x16], x1
        ld1             {v30.d}[0], [x0],  x1
        ld1             {v23.d}[0], [x16], x1
        ld1             {v31.d}[0], [x0],  x1
        ld1             {v16.d}[1], [x16], x1
        ld1             {v24.d}[1], [x0],  x1
        ld1             {v17.d}[1], [x16], x1
        ld1             {v25.d}[1], [x0],  x1
        ld1             {v18.d}[1], [x16], x1
        ld1             {v26.d}[1], [x0],  x1
        ld1             {v19.d}[1], [x16], x1
        ld1             {v27.d}[1], [x0],  x1
        ld1             {v20.d}[1], [x16], x1
        ld1             {v28.d}[1], [x0],  x1
        ld1             {v21.d}[1], [x16], x1
        ld1             {v29.d}[1], [x0],  x1
        ld1             {v22.d}[1], [x16], x1
        ld1             {v30.d}[1], [x0],  x1
        ld1             {v23.d}[1], [x16], x1
        ld1             {v31.d}[1], [x0],  x1

        transpose_8x16b v16, v17, v18, v19, v20, v21, v22, v23, v0,  v1
        transpose_8x16b v24, v25, v26, v27, v28, v29, v30, v31, v0,  v1

        lpf_16_wd16

        sub             x0,  x0,  x1, lsl #4
        sub             x16, x0,  #8

        transpose_8x16b v16, v17, v0,  v1,  v2,  v3,  v4,  v5,  v18, v19
        transpose_8x16b v6,  v7,  v8,  v9,  v10, v11, v30, v31, v18, v19

        st1             {v16.d}[0], [x16], x1
        st1             {v6.d}[0],  [x0],  x1
        st1             {v17.d}[0], [x16], x1
        st1             {v7.d}[0],  [x0],  x1
        st1             {v0.d}[0],  [x16], x1
        st1             {v8.d}[0],  [x0],  x1
        st1             {v1.d}[0],  [x16], x1
        st1             {v9.d}[0],  [x0],  x1
        st1             {v2.d}[0],  [x16], x1
        st1             {v10.d}[0], [x0],  x1
        st1             {v3.d}[0],  [x16], x1
        st1             {v11.d}[0], [x0],  x1
        st1             {v4.d}[0],  [x16], x1
        st1             {v30.d}[0], [x0],  x1
        st1             {v5.d}[0],  [x16], x1
        st1             {v31.d}[0], [x0],  x1
        st1             {v16.d}[1], [x16], x1
        st1             {v6.d}[1],  [x0],  x1
        st1             {v17.d}[1], [x16], x1
        st1             {v7.d}[1],  [x0],  x1
        st1             {v0.d}[1],  [x16], x1
        st1             {v8.d}[1],  [x0],  x1
        st1             {v1.d}[1],  [x16], x1
        st1             {v9.d}[1],  [x0],  x1
        st1             {v2.d}[1],  [x16], x1
        st1             {v10.d}[1], [x0],  x1
        st1             {v3.d}[1],  [x16], x1
        st1             {v11.d}[1], [x0],  x1
        st1             {v4.d}[1],  [x16], x1
        st1             {v30.d}[1], [x0],  x1
        st1             {v5.d}[1],  [x16], x1
        st1             {v31.d}[1], [x0],  x1
        ret             x15

7:
        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #4
        transpose_8x16b v20, v21, v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v20.d}[0], [x16], x1
        st1             {v20.d}[1], [x0],  x1
        st1             {v21.d}[0], [x16], x1
        st1             {v21.d}[1], [x0],  x1
        st1             {v22.d}[0], [x16], x1
        st1             {v22.d}[1], [x0],  x1
        st1             {v23.d}[0], [x16], x1
        st1             {v23.d}[1], [x0],  x1
        st1             {v24.d}[0], [x16], x1
        st1             {v24.d}[1], [x0],  x1
        st1             {v25.d}[0], [x16], x1
        st1             {v25.d}[1], [x0],  x1
        st1             {v26.d}[0], [x16], x1
        st1             {v26.d}[1], [x0],  x1
        st1             {v27.d}[0], [x16], x1
        st1             {v27.d}[1], [x0],  x1
        add             x0,  x0,  #4
        ret             x15
8:
        sub             x16, x0,  x1, lsl #4
        sub             x16, x16, #2
        transpose_4x16b v22, v23, v24, v25, v26, v27, v28, v29
        add             x0,  x16, x1, lsl #3

        st1             {v22.s}[0], [x16], x1
        st1             {v22.s}[2], [x0],  x1
        st1             {v23.s}[0], [x16], x1
        st1             {v23.s}[2], [x0],  x1
        st1             {v24.s}[0], [x16], x1
        st1             {v24.s}[2], [x0],  x1
        st1             {v25.s}[0], [x16], x1
        st1             {v25.s}[2], [x0],  x1
        st1             {v22.s}[1], [x16], x1
        st1             {v22.s}[3], [x0],  x1
        st1             {v23.s}[1], [x16], x1
        st1             {v23.s}[3], [x0],  x1
        st1             {v24.s}[1], [x16], x1
        st1             {v24.s}[3], [x0],  x1
        st1             {v25.s}[1], [x16], x1
        st1             {v25.s}[3], [x0],  x1
        add             x0,  x0,  #2
        ret             x15
endfunc

// void dav1d_lpf_v_sb_y_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                 const uint32_t *const vmask,
//                                 const uint8_t (*l)[4], ptrdiff_t b4_stride,
//                                 const Av1FilterLUT *lut, const int w)

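// Each expansion below walks one superblock edge, handling 16 pixels (four
// 4-pixel units) per loop iteration. The low four bits of the vmask words
// select which units get filtered and at which width, and the thresholds
// are derived per unit from the filter level L and the sharp part of the
// LUT; in C-like terms, the setup below computes
//
//   I = imax(imin(L >> sharp[0], sharp[1]), 1);
//   E = 2 * (L + 2) + I;
//   H = L >> 4;
//
// with L taken from l[0][0], or from l[offset][0] when l[0][0] is zero, and
// units with L == 0 left unfiltered.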
.macro lpf_func dir, type
function lpf_\dir\()_sb_\type\()_8bpc_neon, export=1
        mov             x11, x30
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]
        ldp             w6,  w7,  [x2]           // vmask[0], vmask[1]
.ifc \type, y
        ldr             w2,  [x2, #8]            // vmask[2]
.endif
        add             x5,  x5,  #128           // Move to sharp part of lut
.ifc \type, y
        orr             w7,  w7,  w2             // vmask[1] |= vmask[2]
.endif
.ifc \dir, v
        sub             x4,  x3,  x4, lsl #2
.else
        sub             x3,  x3,  #4
        lsl             x4,  x4,  #2
.endif
        orr             w6,  w6,  w7             // vmask[0] |= vmask[1]

1:
        tst             w6,  #0x0f
.ifc \dir, v
        ld1             {v0.16b}, [x4], #16
        ld1             {v1.16b}, [x3], #16
.else
        ld2             {v0.s,v1.s}[0], [x3], x4
        ld2             {v0.s,v1.s}[1], [x3], x4
        ld2             {v0.s,v1.s}[2], [x3], x4
        ld2             {v0.s,v1.s}[3], [x3], x4
.endif
        b.eq            7f                        // if (!(vm & bits)) continue;

        ld1r            {v5.16b}, [x5]            // sharp[0]
        add             x5,  x5,  #8
        movi            v2.4s,   #0xff
        dup             v13.4s,  w6               // vmask[0]

        and             v0.16b,  v0.16b,  v2.16b  // Keep only lowest byte in each 32 bit word
        and             v1.16b,  v1.16b,  v2.16b
        cmtst           v3.16b,  v1.16b,  v2.16b  // Check for nonzero values in l[0][0]
        movi            v4.16b,  #1
        ld1r            {v6.16b}, [x5]            // sharp[1]
        sub             x5,  x5,  #8
        bif             v1.16b,  v0.16b,  v3.16b  // if (!l[0][0]) L = l[offset][0]
        cmtst           v2.4s,   v1.4s,   v2.4s   // L != 0
        mul             v1.4s,   v1.4s,   v4.4s   // L
.ifc \type, y
        dup             v15.4s,  w2               // vmask[2]
.endif
        dup             v14.4s,  w7               // vmask[1]
        mov             x16, v2.d[0]
        mov             x17, v2.d[1]
        adds            x16, x16, x17
        b.eq            7f                        // if (!L) continue;
        neg             v5.16b,  v5.16b           // -sharp[0]
        movrel          x16,  word_1248
        ushr            v12.16b, v1.16b,  #4      // H
        ld1             {v16.4s}, [x16]
        sshl            v3.16b,  v1.16b,  v5.16b  // L >> sharp[0]
.ifc \type, y
        cmtst           v15.4s,  v15.4s,  v16.4s  // if (vmask[2] & bits)
.endif
        movi            v7.16b,  #2
        umin            v3.16b,  v3.16b,  v6.16b  // imin(L >> sharp[0], sharp[1])
        add             v0.16b,  v1.16b,  v7.16b  // L + 2
        umax            v11.16b, v3.16b,  v4.16b  // imax(imin(), 1) = limit = I
        add             v0.16b,  v0.16b,  v0.16b  // 2*(L + 2)
        cmtst           v14.4s,  v14.4s,  v16.4s  // if (vmask[1] & bits)
        add             v10.16b, v0.16b,  v11.16b // 2*(L + 2) + limit = E
        cmtst           v13.4s,  v13.4s,  v16.4s  // if (vmask[0] & bits)
        and             v13.16b, v13.16b, v2.16b  // vmask[0] &= L != 0

.ifc \type, y
        tst             w2,  #0x0f
        b.eq            2f
        // wd16
        bl              lpf_\dir\()_16_16_neon
        b               8f
2:
.endif
        tst             w7,  #0x0f
        b.eq            3f
.ifc \type, y
        // wd8
        bl              lpf_\dir\()_8_16_neon
.else
        // wd6
        bl              lpf_\dir\()_6_16_neon
.endif
        b               8f
3:
        // wd4
        bl              lpf_\dir\()_4_16_neon
.ifc \dir, h
        b               8f
7:
        // For dir h, the functions above increment x0.
        // If the whole function is skipped, increment it here instead.
        add             x0,  x0,  x1,  lsl #4
.else
7:
.endif
8:
        lsr             w6,  w6,  #4              // vmask[0] >>= 4
        lsr             w7,  w7,  #4              // vmask[1] >>= 4
.ifc \type, y
        lsr             w2,  w2,  #4              // vmask[2] >>= 4
.endif
.ifc \dir, v
        add             x0,  x0,  #16
.else
        // For dir h, x0 is returned incremented
.endif
        cbnz            w6,  1b

        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret             x11
endfunc
.endm

lpf_func v, y
lpf_func h, y
lpf_func v, uv
lpf_func h, uv

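// Bit pattern {1, 2, 4, 8}: tested (cmtst) against the broadcast vmask words
// above to turn the per-4-pixel-unit mask bits into full-width lane masks.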
const word_1248
        .word 1, 2, 4, 8
endconst