/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"

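// Do a 4x8 transpose of the 16 bit elements in r0-r3, in practice as two
// 4x4 transposes of the two 4x4 halves, using t4-t7 as scratch registers.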
.macro transpose_4x8H r0, r1, r2, r3, t4, t5, t6, t7
        trn1            \t4\().8h,  \r0\().8h,  \r1\().8h
        trn2            \t5\().8h,  \r0\().8h,  \r1\().8h
        trn1            \t6\().8h,  \r2\().8h,  \r3\().8h
        trn2            \t7\().8h,  \r2\().8h,  \r3\().8h

        trn1            \r0\().4s,  \t4\().4s,  \t6\().4s
        trn2            \r2\().4s,  \t4\().4s,  \t6\().4s
        trn1            \r1\().4s,  \t5\().4s,  \t7\().4s
        trn2            \r3\().4s,  \t5\().4s,  \t7\().4s
.endm

// The input to and output from this macro is in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
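// The thresholds E, I and H are passed in w2, w3 and w4, the flat threshold
// in w5, the left shift used for saturation (16 - BIT_DEPTH) in w6 and the
// max pixel value in w7. The normal return address is expected in x10;
// x13 (for wd=8) and x14/x15 (for wd=16) hold alternative return targets,
// used for skipping ahead to the smaller writeouts.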
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h,  w2                   // E
        dup             v2.8h,  w3                   // I
        dup             v3.8h,  w4                   // H

        uabd            v4.8h,  v20.8h, v21.8h       // abs(p3 - p2)
        uabd            v5.8h,  v21.8h, v22.8h       // abs(p2 - p1)
        uabd            v6.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            v7.8h,  v24.8h, v25.8h       // abs(q0 - q1)
        uabd            \tmp1\().8h,  v25.8h, v26.8h // abs(q1 - q2)
        uabd            \tmp2\().8h,  v26.8h, v27.8h // abs(q2 - q3)
        umax            v4.8h,  v4.8h,  v5.8h
        umax            v5.8h,  v6.8h,  v7.8h
        umax            \tmp1\().8h,  \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h,  v23.8h, v24.8h       // abs(p0 - q0)
        umax            v4.8h,  v4.8h,  v5.8h
        add             v6.8h,  v6.8h,  v6.8h        // abs(p0 - q0) * 2
        uabd            v5.8h,  v22.8h, v25.8h       // abs(p1 - q1)
        umax            v4.8h,  v4.8h,  \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h,  v5.8h,  #1
        cmhs            v4.8h,  v2.8h,  v4.8h        // max(abs()) <= I
        add             v6.8h,  v6.8h,  v5.8h        // abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1)
        cmhs            v6.8h,  v0.8h,  v6.8h
        and             v4.16b, v4.16b, v6.16b       // fm
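        // In rough C terms, the filter decision computed above is:
        //   fm = max(abs(p3 - p2), abs(p2 - p1), abs(p1 - p0), abs(q0 - q1),
        //            abs(q1 - q2), abs(q2 - q3)) <= I &&
        //        abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E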

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        br              x10
1:

.if \wd >= 8
        dup             v0.8h,  w5

        uabd            v6.8h,  v20.8h, v23.8h       // abs(p3 - p0)
        uabd            v2.8h,  v21.8h, v23.8h       // abs(p2 - p0)
        uabd            v1.8h,  v22.8h, v23.8h       // abs(p1 - p0)
        uabd            \tmp1\().8h,  v25.8h, v24.8h // abs(q1 - q0)
        uabd            \tmp2\().8h,  v26.8h, v24.8h // abs(q2 - q0)
        uabd            \tmp3\().8h,  v27.8h, v24.8h // abs(q3 - q0)
        umax            v6.8h,  v6.8h,  v2.8h
        umax            v1.8h,  v1.8h,  \tmp1\().8h
        umax            \tmp2\().8h,  \tmp2\().8h,  \tmp3\().8h
.if \wd == 16
        uabd            v7.8h,  v16.8h, v23.8h       // abs(p7 - p0)
        umax            v6.8h,  v6.8h,  v1.8h
        uabd            v2.8h,  v17.8h, v23.8h       // abs(p6 - p0)
        umax            v6.8h,  v6.8h,  \tmp2\().8h
        uabd            v1.8h,  v18.8h, v23.8h       // abs(p5 - p0)
        cmhs            v6.8h,  v0.8h,  v6.8h        // flat8in
        uabd            v8.8h,  v19.8h, v23.8h       // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b       // flat8in && fm
        uabd            v9.8h,  v28.8h, v24.8h       // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b       // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h       // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h       // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h       // abs(q7 - q0)

        umax            v7.8h,  v7.8h,  v2.8h
        umax            v1.8h,  v1.8h,  v8.8h
        umax            v9.8h,  v9.8h,  v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h,  v22.8h, v23.8h                  // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v1.8h
        umax            v9.8h,  v9.8h,  v11.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  v1.8h
.endif
        uabd            v1.8h,  v25.8h, v24.8h                  // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h,  v7.8h,  v9.8h
.elseif \wd == 8
        umax            v6.8h,  v6.8h,  \tmp2\().8h
.endif
        dup             \tmp2\().8h,  w6                        // left shift for saturation
        sub             \tmp1\().8h,  v22.8h,  v25.8h           // p1 - q1
        neg             \tmp6\().8h,  \tmp2\().8h               // negative left shift after saturation
        umax            v5.8h,  v5.8h,  v1.8h                   // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h,  v24.8h,  v23.8h           // q0 - p0
        movi            \tmp5\().8h,  #3
.if \wd == 8
        cmhs            v6.8h,  v0.8h,  v6.8h                   // flat8in
.endif
        cmhs            v5.8h,  v3.8h,  v5.8h                   // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b                  // flat8in && fm
.endif
        sqshl           \tmp1\().8h,  \tmp1\().8h,  \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h,  v0.8h,  v7.8h                   // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b                  // fm && !flat8in
.endif
        and             v5.16b,  v5.16b,  v4.16b                // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b                  // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h,  \tmp3\().8h,  \tmp5\().8h // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b      // if (!hev) av_clip_int2p = 0
        movi            v2.8h,  #4
        add             \tmp3\().8h,  \tmp3\().8h,  \tmp1\().8h // 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        movi            v3.8h,  #3
        sqshl           \tmp1\().8h,  \tmp3\().8h,  \tmp2\().8h
        movi            \tmp5\().8h,  #0
        sshl            \tmp1\().8h,  \tmp1\().8h,  \tmp6\().8h // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h,  w7                        // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b                  // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h,  \tmp6\().8h,  #1          // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h,  \tmp1\().8h,  v2.8h       // f + 4
        add             \tmp4\().8h,  \tmp1\().8h,  v3.8h       // f + 3
        smin            \tmp3\().8h,  \tmp3\().8h,  \tmp2\().8h // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h,  \tmp4\().8h,  \tmp2\().8h // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h,  \tmp3\().8h,  #3          // f1
        sshr            \tmp4\().8h,  \tmp4\().8h,  #3          // f2

        add             v0.8h,   v23.8h,  \tmp4\().8h           // p0 + f2
        sub             v2.8h,   v24.8h,  \tmp3\().8h           // q0 - f1
        smin            v0.8h,   v0.8h,   \tmp6\().8h
        smin            v2.8h,   v2.8h,   \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1            // f = (f1 + 1) >> 1
        smax            v0.8h,   v0.8h,   \tmp5\().8h           // out p0
        smax            v2.8h,   v2.8h,   \tmp5\().8h           // out q0
        bit             v23.16b, v0.16b,  v4.16b                // if (fm && !flat8in)
        bit             v24.16b, v2.16b,  v4.16b

        add             v0.8h,  v22.8h,  \tmp3\().8h            // p1 + f
        sub             v2.8h,  v25.8h,  \tmp3\().8h            // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h,  v0.8h,  \tmp6\().8h
        smin            v2.8h,  v2.8h,  \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h,  v0.8h,  \tmp5\().8h             // out p1
        smax            v2.8h,  v2.8h,  \tmp5\().8h             // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b,  v5.16b                // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b,  v5.16b
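        // In rough C terms (reusing the helper names from the comments above,
        // with max being the max pixel value), the inner filter applied above is:
        //   f  = av_clip_int2p(3 * (q0 - p0) +
        //                      (hev ? av_clip_int2p(p1 - q1, BIT_DEPTH - 1) : 0),
        //                      BIT_DEPTH - 1);
        //   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        //   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
        //   p0 = av_clip(p0 + f2, 0, max);  q0 = av_clip(q0 - f1, 0, max);  // if (fm && !flat8in)
        //   f  = (f1 + 1) >> 1;
        //   p1 = av_clip(p1 + f, 0, max);   q1 = av_clip(q1 - f, 0, max);   // additionally if (!hev)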

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        br              x13
1:
.endif

        // flat8in
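        // In rough C terms, each flat8in output is a weighted average of
        // nearby pixels with weights summing to 8, rounded and shifted down
        // by 3, e.g.
        //   out p2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
        // and each following output slides that window one pixel towards q3;
        // the running sum is kept in v0 below.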
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h,  \tmp1\().8h, \tmp1\().8h
        add             v0.8h,  v0.8h,  v23.8h
        add             v0.8h,  v0.8h,  v24.8h
        add             v0.8h,  v0.8h,  \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h,  v0.8h,  #3                      // out p2

        add             v0.8h,  v0.8h,  \tmp3\().8h
        add             \tmp1\().8h, v20.8h,  v23.8h
        add             \tmp3\().8h, v24.8h,  v27.8h
        urshr           v3.8h,  v0.8h,  #3                      // out p1

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h,  v24.8h
        add             \tmp7\().8h, v25.8h,  v27.8h
        urshr           v4.8h,  v0.8h,  #3                      // out p0

        add             v0.8h,  v0.8h,  \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h,  v25.8h
        add             \tmp3\().8h, v26.8h,  v27.8h
        urshr           v5.8h,  v0.8h,  #3                      // out q0

        add             v0.8h,  v0.8h,  \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h,  #3                 // out q1

        add             v0.8h,  v0.8h,  \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b,  v6.16b
        bit             v22.16b, v3.16b,  v6.16b
        bit             v23.16b, v4.16b,  v6.16b
        urshr           \tmp6\().8h,  v0.8h,  #3                // out q2
        bit             v24.16b, v5.16b,  v6.16b
        bit             v25.16b, \tmp5\().16b,  v6.16b
        bit             v26.16b, \tmp6\().16b,  v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b,  v6.16b,  v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need either flat8in or flat8out, jump to a
        // writeout of the inner 4 pixels
        br              x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        br              x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the input
        // to this section).
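        // In rough C terms, each flat8out output is a weighted average with
        // weights summing to 16, rounded and shifted down by 4, e.g.
        //   out p6 = (7 * p7 + 2 * p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // and each following output again slides that window one pixel
        // towards q7, with the running sum kept in v0.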
        shl             v0.8h,   v16.8h,  #3     // 8 * v16
        sub             v0.8h,   v0.8h,   v16.8h // 7 * v16
        add             v0.8h,   v0.8h,   v17.8h
        add             v8.8h,   v17.8h,  v18.8h
        add             v10.8h,  v19.8h,  v20.8h
        add             v0.8h,   v0.8h,   v8.8h
        add             v8.8h,   v16.8h,  v17.8h
        add             v12.8h,  v21.8h,  v22.8h
        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v18.8h,  v25.8h
        add             v14.8h,  v23.8h,  v24.8h
        sub             v10.8h,  v10.8h,  v8.8h
        add             v0.8h,   v0.8h,   v12.8h
        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v18.8h
        add             v14.8h,  v19.8h,  v26.8h
        urshr           v2.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v19.8h
        add             v10.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v2.16b,  v17.16b, v7.16b
        urshr           v3.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v20.8h
        add             v14.8h,  v21.8h,  v28.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v3.16b,  v18.16b, v7.16b
        urshr           v4.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v8.8h,   v16.8h,  v21.8h
        add             v10.8h,  v22.8h,  v29.8h
        sub             v14.8h,  v14.8h,  v12.8h
        bif             v4.16b,  v19.16b, v7.16b
        urshr           v5.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v12.8h,  v16.8h,  v22.8h
        add             v14.8h,  v23.8h,  v30.8h
        sub             v10.8h,  v10.8h,  v8.8h
        bif             v5.16b,  v20.16b, v7.16b
        urshr           v6.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        add             v10.8h,  v16.8h,  v23.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v24.8h,  v31.8h
        bif             v6.16b,  v21.16b, v7.16b
        urshr           v8.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        sub             v10.8h,  v12.8h,  v10.8h
        add             v12.8h,  v17.8h,  v24.8h
        add             v14.8h,  v25.8h,  v31.8h
        bif             v8.16b,  v22.16b, v7.16b
        urshr           v9.8h,   v0.8h,   #4

        add             v0.8h,   v0.8h,   v10.8h
        sub             v14.8h,  v14.8h,  v12.8h
        add             v12.8h,  v26.8h,  v31.8h
        bif             v9.16b,  v23.16b, v7.16b
        urshr           v10.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v18.8h,  v25.8h
        add             v18.8h,  v19.8h,  v26.8h
        sub             v12.8h,  v12.8h,  v14.8h
        add             v14.8h,  v27.8h,  v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v12.8h
        add             v12.8h,  v20.8h,  v27.8h
        sub             v14.8h,  v14.8h,  v18.8h
        add             v18.8h,  v28.8h,  v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h,  v18.8h,  v12.8h
        urshr           v12.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v14.8h
        add             v14.8h,  v21.8h,  v28.8h
        add             v20.8h,  v29.8h,  v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v18.8h
        sub             v20.8h,  v20.8h,  v14.8h
        add             v18.8h,  v22.8h,  v29.8h
        add             v22.8h,  v30.8h,  v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v20.8h
        sub             v22.8h,  v22.8h,  v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h,  v0.8h,   #4

        add             v0.8h,   v0.8h,   v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h,  v0.8h,   #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 as temp registers; for wd=16 those
// are needed as inputs/outputs, so we use v8-v15 as temp registers there
// instead.
function vp9_loop_filter_4
        loop_filter     4,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8,  v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8,  v9,  v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
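// The 10 and 12 bit frontends below scale the 8 bit thresholds up to the
// current bit depth before calling (or tail-calling) the shared 16 bit core;
// roughly (variable names here are only for illustration):
//   E <<= bpp - 8;  I <<= bpp - 8;  H <<= bpp - 8;
//   flat_thresh = 1 << (bpp - 8);  // passed in x5
//   sat_shift   = 16 - bpp;        // passed in x6, shift used for saturation
//   max_pixel   = (1 << bpp) - 1;  // passed in x7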

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        br              x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8,  d9,  [sp, #-0x10]!
.endif
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8,  d9,  [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        br              x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

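// The mix2 frontends filter two adjacent 8 pixel blocks, potentially with
// different filter widths; the E, I and H values for the second block are
// packed into bits 8-15 of w2, w3 and w4 and unpacked below.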
.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8,  w2,  #8
        lsr             w14, w3,  #8
        lsr             w15, w4,  #8
        and             w2,  w2,  #0xff
        and             w3,  w3,  #0xff
        and             w4,  w4,  #0xff
        lsl             w2,  w2,  #\bpp - 8
        lsl             w3,  w3,  #\bpp - 8
        lsl             w4,  w4,  #\bpp - 8
        mov             x5,  #1 << (\bpp - 8)
        mov             x6,  #16 - \bpp
        mov             x7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0,  x0,  x1, lsl #3
.else
        add             x0,  x0,  #16
.endif
        lsl             w2,  w8,  #\bpp - 8
        lsl             w3,  w14, #\bpp - 8
        lsl             w4,  w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        br              x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x0,  x0,  x1, lsl #2
        sub             x9,  x9,  x1, lsl #1

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1

        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2

        // We will only write back the mid 4 pixels; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4

        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v27.8h}, [x0], x1 // q3
        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #2
        add             x9,  x9,  x1

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1

        br              x10
6:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9,  x9,  x1, lsl #2
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0,  x9,  x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8

        br              x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
        add             x9,  x9,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  x1, lsl #3
        ld1             {v16.8h}, [x9], x1 // p7
        ld1             {v24.8h}, [x0], x1 // q0
        ld1             {v17.8h}, [x9], x1 // p6
        ld1             {v25.8h}, [x0], x1 // q1
        ld1             {v18.8h}, [x9], x1 // p5
        ld1             {v26.8h}, [x0], x1 // q2
        ld1             {v19.8h}, [x9], x1 // p4
        ld1             {v27.8h}, [x0], x1 // q3
        ld1             {v20.8h}, [x9], x1 // p3
        ld1             {v28.8h}, [x0], x1 // q4
        ld1             {v21.8h}, [x9], x1 // p2
        ld1             {v29.8h}, [x0], x1 // q5
        ld1             {v22.8h}, [x9], x1 // p1
        ld1             {v30.8h}, [x0], x1 // q6
        ld1             {v23.8h}, [x9], x1 // p0
        ld1             {v31.8h}, [x0], x1 // q7
        sub             x9,  x9,  x1, lsl #3
        sub             x0,  x0,  x1, lsl #3
        add             x9,  x9,  x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h},  [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  x1

        br              x10
8:
        add             x9,  x9,  x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        sub             x0,  x0,  x1
        br              x10
7:
        sub             x9,  x0,  x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #1
        br              x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9,  x0,  #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        sub             x9,  x9,  x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H  v16, v2,  v3,  v4,  v5,  v6,  v8,  v9,  v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h},  [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h},  [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h},  [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h},  [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h},  [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h},  [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h},  [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3

        br              x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9,  x0,  #8
        add             x0,  x9,  x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #8
        br              x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9,  x0,  #4
        add             x0,  x9,  x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #3
        add             x0,  x0,  #4
        br              x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1