/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"

.macro transpose16_q_8x8 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
        vswp             \r1,  \r8  @ vtrn.64 \rq0, \rq4
        vswp             \r3,  \r10 @ vtrn.64 \rq1, \rq5
        vswp             \r5,  \r12 @ vtrn.64 \rq2, \rq6
        vswp             \r7,  \r14 @ vtrn.64 \rq3, \rq7
        vtrn.32          \rq0, \rq2
        vtrn.32          \rq1, \rq3
        vtrn.32          \rq4, \rq6
        vtrn.32          \rq5, \rq7
        vtrn.16          \rq0, \rq1
        vtrn.16          \rq2, \rq3
        vtrn.16          \rq4, \rq5
        vtrn.16          \rq6, \rq7
.endm

.macro transpose16_4x4 r0, r1, r2, r3
        vtrn.32          \r0, \r2
        vtrn.32          \r1, \r3
        vtrn.16          \r0, \r1
        vtrn.16          \r2, \r3
.endm
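
@ As a rough illustration, both transpose16_4x4 above and transpose16_q_4x4
@ below turn 4 rows of 4 16 bit elements into 4 columns, i.e.:
@   a0 a1 a2 a3           a0 b0 c0 d0
@   b0 b1 b2 b3    --->   a1 b1 c1 d1
@   c0 c1 c2 c3           a2 b2 c2 d2
@   d0 d1 d2 d3           a3 b3 c3 d3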

@ Do a 4x4 transpose, using q registers for the subtransposes that don't
@ need to address the individual d registers.
@ r0,r1 == rq0, r2,r3 == rq1
.macro transpose16_q_4x4 rq0, rq1, r0, r1, r2, r3
        vtrn.32         \rq0, \rq1
        vtrn.16         \r0,  \r1
        vtrn.16         \r2,  \r3
.endm

@ The input to and output from this macro is in the registers q8-q15,
@ and q0-q7 are used as scratch registers.
@ p3 = q8, p0 = q11, q0 = q12, q3 = q15
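@ For reference, the fm decision computed below corresponds roughly to
@   fm = max(abs(p3 - p2), ..., abs(q2 - q3)) <= I &&
@        abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E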
.macro loop_filter_q wd
        vdup.u16        q0,  r2          @ E
        vdup.u16        q1,  r3          @ I

        vabd.u16        q2,  q8,  q9     @ abs(p3 - p2)
        vabd.u16        q3,  q9,  q10    @ abs(p2 - p1)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q12, q13    @ abs(q0 - q1)
        vabd.u16        q6,  q13, q14    @ abs(q1 - q2)
        vabd.u16        q7,  q14, q15    @ abs(q2 - q3)
        vmax.u16        q2,  q2,  q3
        vmax.u16        q3,  q4,  q5
        vmax.u16        q4,  q6,  q7
        vabd.u16        q5,  q11, q12    @ abs(p0 - q0)
        vmax.u16        q2,  q2,  q3
        vadd.u16        q5,  q5,  q5     @ abs(p0 - q0) * 2
        vabd.u16        q6,  q10, q13    @ abs(p1 - q1)
        vmax.u16        q2,  q2,  q4     @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        q6,  q6,  #1
        vcle.u16        q2,  q2,  q1     @ max(abs()) <= I
        vadd.u16        q5,  q5,  q6     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        q5,  q5,  q0
        vand            q2,  q2,  q5     @ fm

        vmovn.u16       d10, q2
        vmov            r8,  r9,  d10
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        q0,  r5

        vabd.u16        q1,  q8,  q11    @ abs(p3 - p0)
        vabd.u16        q3,  q9,  q11    @ abs(p2 - p0)
        vabd.u16        q4,  q10, q11    @ abs(p1 - p0)
        vabd.u16        q5,  q13, q12    @ abs(q1 - q0)
        vabd.u16        q6,  q14, q12    @ abs(q2 - q0)
        vabd.u16        q7,  q15, q12    @ abs(q3 - q0)
        vmax.u16        q1,  q1,  q3
        vmax.u16        q4,  q4,  q5
        vmax.u16        q6,  q6,  q7
        @ The rest of the calculation of flat8in is interleaved below
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
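        @ In C terms this corresponds roughly to (av_clip_int2p clipping to
        @ BIT_DEPTH - 1 signed bits, hev being based on the H threshold, and
        @ the results clamped to the valid pixel range):
        @   f  = av_clip_int2p(3 * (q0 - p0) + (hev ? av_clip_int2p(p1 - q1) : 0))
        @   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        @   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3
        @   p0 += f2, q0 -= f1
        @   if (!hev) { f = (f1 + 1) >> 1; p1 += f; q1 -= f; }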
        vabd.u16        q3,  q10, q11    @ abs(p1 - p0)
.if \wd == 8
        vmax.u16        q1,  q1,  q4
.endif
        vabd.u16        q4,  q13, q12    @ abs(q1 - q0)
.if \wd == 8
        vmax.u16        q1,  q1,  q6
.endif

        vsub.u16        q5,  q10, q13    @ p1 - q1
        vmax.u16        q3,  q3,  q4     @ max(abs(p1 - p0), abs(q1 - q0))
        vdup.u16        q4,  r4          @ H
        vsub.u16        q6,  q12, q11    @ q0 - p0
.if \wd == 8
        vcle.u16        q1,  q1,  q0     @ flat8in
.endif
        vdup.u16        q0,  r6          @ left shift for saturation
        vcle.u16        q3,  q3,  q4     @ !hev
.if \wd == 8
        vand            q1,  q1,  q2     @ flat8in && fm
.endif
        vneg.s16        q4,  q0          @ negative left shift after saturation
        vqshl.s16       q5,  q5,  q0
.if \wd == 8
        vbic            q2,  q2,  q1     @ fm && !flat8in
.endif
        vmov.s16        q7,  #3
        vand            q3,  q3,  q2     @ !hev && fm && !flat8in
        vshl.s16        q5,  q5,  q4     @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        q6,  q6,  q7     @ 3 * (q0 - p0)
        vbic            q5,  q5,  q3     @ if (!hev) av_clip_int2p = 0
        vadd.s16        q6,  q6,  q5     @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        q5,  #4
        vqshl.s16       q6,  q6,  q0
        vmov.s16        q0,  #3
        vshl.s16        q6,  q6,  q4     @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        q4,  r7          @ max pixel value

        vshr.u16        q4,  q4,  #1     @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        q5,  q6,  q5     @ f + 4
        vadd.s16        q0,  q6,  q0     @ f + 3
        vmov.s16        q6,  #0
        vmin.s16        q5,  q5,  q4     @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        q0,  q0,  q4     @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vdup.u16        q4,  r7          @ max pixel value
        vshr.s16        q5,  q5,  #3     @ f1
        vshr.s16        q0,  q0,  #3     @ f2

        vadd.s16        q0,  q11, q0     @ p0 + f2
        vsub.s16        q7,  q12, q5     @ q0 - f1
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
        vrshr.s16       q5,  q5,  #1     @ f = (f1 + 1) >> 1
        vmax.s16        q0,  q0,  q6     @ out p0
        vmax.s16        q7,  q7,  q6     @ out q0
        vbit            q11, q0,  q2     @ if (fm && !flat8in)
        vbit            q12, q7,  q2
.if \wd >= 8
        vmovn.u16       d4,  q1
.endif

        vadd.s16        q0,  q10, q5     @ p1 + f
        vsub.s16        q7,  q13, q5     @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d4
.endif
        vmin.s16        q0,  q0,  q4
        vmin.s16        q7,  q7,  q4
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        q0,  q0,  q6     @ out p1
        vmax.s16        q7,  q7,  q6     @ out q1
        vbit            q10, q0,  q3     @ if (!hev && fm && !flat8in)
        vbit            q13, q7,  q3

.if \wd >= 8
        @ If no pixels need flat8in, jump to a writeout of the inner 4 pixels
        beq             6f

        @ flat8in
        vadd.u16        q2,  q8,  q9
        vadd.u16        q3,  q10, q13
        vadd.u16        q4,  q8,  q10
        vadd.u16        q5,  q11, q14
        vadd.u16        q0,  q2,  q2
        vadd.u16        q0,  q0,  q11
        vadd.u16        q0,  q0,  q12
        vadd.u16        q0,  q0,  q4
        vsub.s16        q3,  q3,  q2
        vsub.s16        q5,  q5,  q4
        vrshr.u16       q6,  q0,  #3     @ out p2

        vadd.u16        q0,  q0,  q3
        vadd.u16        q2,  q8,  q11
        vadd.u16        q3,  q12, q15
        vrshr.u16       q7,  q0,  #3     @ out p1

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vadd.u16        q4,  q9,  q12
        vbit            q9,  q6,  q1
        vadd.u16        q5,  q13, q15
        vrshr.u16       q6,  q0,  #3     @ out p0

        vadd.u16        q0,  q0,  q3
        vsub.s16        q5,  q5,  q4
        vadd.u16        q2,  q10, q13
        vbit            q10, q7,  q1
        vadd.u16        q3,  q14, q15
        vrshr.u16       q7,  q0,  #3     @ out q0

        vadd.u16        q0,  q0,  q5
        vsub.s16        q3,  q3,  q2
        vbit            q11, q6,  q1
        vrshr.u16       q6,  q0,  #3     @ out q1

        vadd.u16        q0,  q0,  q3
        vbit            q12, q7,  q1
        vrshr.u16       q7,  q0,  #3     @ out q2
        vbit            q13, q6,  q1
        vbit            q14, q7,  q1
.endif
.endm

@ The input to and output from this macro is in the registers d16-d31,
@ and d0-d7 are used as scratch registers.
@ p7 = d16 .. p3 = d20, p0 = d23, q0 = d24, q3 = d27, q7 = d31
@ Depending on the width of the loop filter, we either use d16-d19
@ and d28-d31 as temp registers, or d8-d15.
@ In practice, this is only ever instantiated once, so the macro parameters
@ could be hardcoded, but they are kept as is for similarity with the
@ 8 bpp and aarch64 versions.
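@ For reference, the flatness decisions computed below correspond roughly to
@   flat8in  = max(abs(p3 - p0), ..., abs(q3 - q0)) <= F
@   flat8out = max(abs(p7 - p0), ..., abs(q7 - q0)) <= F
@ with F (passed in r5) being 1 << (BIT_DEPTH - 8), as set up by the
@ bpp_frontend macros below.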
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        vdup.u16        d0,  r2          @ E
        vdup.u16        d2,  r3          @ I

        vabd.u16        d4,  d20, d21    @ abs(p3 - p2)
        vabd.u16        d5,  d21, d22    @ abs(p2 - p1)
        vabd.u16        d6,  d22, d23    @ abs(p1 - p0)
        vabd.u16        d7,  d24, d25    @ abs(q0 - q1)
        vabd.u16        \tmp1,  d25, d26 @ abs(q1 - q2)
        vabd.u16        \tmp2,  d26, d27 @ abs(q2 - q3)
        vmax.u16        d4,  d4,  d5
        vmax.u16        d5,  d6,  d7
        vmax.u16        \tmp1,  \tmp1,  \tmp2
        vabd.u16        d6,  d23, d24    @ abs(p0 - q0)
        vmax.u16        d4,  d4,  d5
        vadd.u16        d6,  d6,  d6     @ abs(p0 - q0) * 2
        vabd.u16        d5,  d22, d25    @ abs(p1 - q1)
        vmax.u16        d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
        vshr.u16        d5,  d5,  #1
        vcle.u16        d4,  d4,  d2     @ max(abs()) <= I
        vadd.u16        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        vcle.u16        d6,  d6,  d0
        vand            d4,  d4,  d6     @ fm

        vdup.u16        d3,  r4          @ H
        vmov            r8,  r9,  d4
        orrs            r8,  r8,  r9
        @ If no pixels need filtering, just exit as soon as possible
        beq             9f

.if \wd >= 8
        vdup.u16        d0,  r5

        vabd.u16        d6,  d20, d23    @ abs(p3 - p0)
        vabd.u16        d2,  d21, d23    @ abs(p2 - p0)
        vabd.u16        d1,  d22, d23    @ abs(p1 - p0)
        vabd.u16        \tmp1,  d25, d24 @ abs(q1 - q0)
        vabd.u16        \tmp2,  d26, d24 @ abs(q2 - q0)
        vabd.u16        \tmp3,  d27, d24 @ abs(q3 - q0)
        vmax.u16        d6,  d6,  d2
        vmax.u16        d1,  d1,  \tmp1
        vmax.u16        \tmp2,  \tmp2,  \tmp3
.if \wd == 16
        vabd.u16        d7,  d16, d23    @ abs(p7 - p0)
        vmax.u16        d6,  d6,  d1
        vabd.u16        d2,  d17, d23    @ abs(p6 - p0)
        vmax.u16        d6,  d6,  \tmp2
        vabd.u16        d1,  d18, d23    @ abs(p5 - p0)
        vcle.u16        d6,  d6,  d0     @ flat8in
        vabd.u16        d8,  d19, d23    @ abs(p4 - p0)
        vand            d6,  d6,  d4     @ flat8in && fm
        vabd.u16        d9,  d28, d24    @ abs(q4 - q0)
        vbic            d4,  d4,  d6     @ fm && !flat8in
        vabd.u16        d10, d29, d24    @ abs(q5 - q0)
        vabd.u16        d11, d30, d24    @ abs(q6 - q0)
        vabd.u16        d12, d31, d24    @ abs(q7 - q0)

        vmax.u16        d7,  d7,  d2
        vmax.u16        d1,  d1,  d8
        vmax.u16        d9,  d9,  d10
        vmax.u16        d11, d11, d12
        @ The rest of the calculation of flat8out is interleaved below
.else
        @ The rest of the calculation of flat8in is interleaved below
.endif
.endif

        @ Calculate the normal inner loop filter for 2 or 4 pixels
        vabd.u16        d5,  d22, d23           @ abs(p1 - p0)
.if \wd == 16
        vmax.u16        d7,  d7,  d1
        vmax.u16        d9,  d9,  d11
.elseif \wd == 8
        vmax.u16        d6,  d6,  d1
.endif
        vabd.u16        d1,  d25, d24           @ abs(q1 - q0)
.if \wd == 16
        vmax.u16        d7,  d7,  d9
.elseif \wd == 8
        vmax.u16        d6,  d6,  \tmp2
.endif
        vdup.u16        \tmp2,  r6              @ left shift for saturation
        vsub.u16        \tmp1,  d22, d25        @ p1 - q1
        vneg.s16        \tmp6,  \tmp2           @ negative left shift after saturation
        vmax.u16        d5,  d5,  d1            @ max(abs(p1 - p0), abs(q1 - q0))
        vsub.u16        \tmp3,   d24, d23       @ q0 - p0
        vmov.s16        \tmp5,  #3
.if \wd == 8
        vcle.u16        d6,  d6,  d0            @ flat8in
.endif
        vcle.u16        d5,  d5,  d3            @ !hev
.if \wd == 8
        vand            d6,  d6,  d4            @ flat8in && fm
.endif
        vqshl.s16       \tmp1,  \tmp1,  \tmp2
.if \wd == 16
        vcle.u16        d7,  d7,  d0            @ flat8out
.elseif \wd == 8
        vbic            d4,  d4,  d6            @ fm && !flat8in
.endif
        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
.if \wd == 16
        vand            d7,  d7,  d6            @ flat8out && flat8in && fm
.endif
        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        vmul.s16        \tmp3,  \tmp3,  \tmp5   @ 3 * (q0 - p0)
        vbic            \tmp1,  \tmp1,   d5     @ if (!hev) av_clip_int2p = 0
        vmov.s16        d2,  #4
        vadd.s16        \tmp3,  \tmp3,  \tmp1   @ 3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)]
        vmov.s16        d3,  #3
        vqshl.s16       \tmp1,  \tmp3,  \tmp2
        vmov.s16        \tmp5,  #0
        vshl.s16        \tmp1,  \tmp1,  \tmp6   @ av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        vdup.u16        \tmp6,  r7              @ max pixel value
.if \wd == 16
        vbic            d6,  d6,  d7            @ fm && flat8in && !flat8out
.endif

        vshr.u16        \tmp2,  \tmp6,  #1      @ (1 << (BIT_DEPTH - 1)) - 1

        vadd.s16        \tmp3,  \tmp1,  d2      @ f + 4
        vadd.s16        \tmp4,  \tmp1,  d3      @ f + 3
        vmin.s16        \tmp3,  \tmp3,  \tmp2   @ FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        vmin.s16        \tmp4,  \tmp4,  \tmp2   @ FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        vshr.s16        \tmp3,  \tmp3,  #3      @ f1
        vshr.s16        \tmp4,  \tmp4,  #3      @ f2

        vadd.s16        d0,  d23, \tmp4         @ p0 + f2
        vsub.s16        d2,  d24, \tmp3         @ q0 - f1
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
        vrshr.s16       \tmp3,  \tmp3,  #1      @ f = (f1 + 1) >> 1
        vmax.s16        d0,  d0,  \tmp5         @ out p0
        vmax.s16        d2,  d2,  \tmp5         @ out q0
        vbit            d23, d0,  d4            @ if (fm && !flat8in)
        vbit            d24, d2,  d4

        vadd.s16        d0,  d22, \tmp3         @ p1 + f
        vsub.s16        d2,  d25, \tmp3         @ q1 - f
.if \wd >= 8
        vmov            r8,  r9,  d6
.endif
        vmin.s16        d0,  d0,  \tmp6
        vmin.s16        d2,  d2,  \tmp6
.if \wd >= 8
        orrs            r8,  r8,  r9
.endif
        vmax.s16        d0,  d0,  \tmp5         @ out p1
        vmax.s16        d2,  d2,  \tmp5         @ out q1
        vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
        vbit            d25, d2,  d5

.if \wd >= 8
        @ If no pixels need flat8in, jump to flat8out
        @ (or to a writeout of the inner 4 pixels, for wd=8)
        beq             6f

        @ flat8in
        vadd.u16        \tmp1,  d20, d21
        vadd.u16        \tmp3,  d22, d25
        vadd.u16        \tmp5,  d20, d22
        vadd.u16        \tmp7,  d23, d26
        vadd.u16        d0,  \tmp1,  \tmp1
        vadd.u16        d0,  d0,  d23
        vadd.u16        d0,  d0,  d24
        vadd.u16        d0,  d0,  \tmp5
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vrshr.u16       d2,  d0,  #3            @ out p2

        vadd.u16        d0,  d0,  \tmp3
        vadd.u16        \tmp1,  d20, d23
        vadd.u16        \tmp3,  d24, d27
        vrshr.u16       d3,  d0,  #3            @ out p1

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vadd.u16        \tmp5,  d21, d24
        vadd.u16        \tmp7,  d25, d27
        vrshr.u16       d4,  d0,  #3            @ out p0

        vadd.u16        d0,  d0,  \tmp3
        vsub.s16        \tmp7,  \tmp7,  \tmp5
        vadd.u16        \tmp1,  d22, d25
        vadd.u16        \tmp3,  d26, d27
        vrshr.u16       d5,  d0,  #3            @ out q0

        vadd.u16        d0,  d0,  \tmp7
        vsub.s16        \tmp3,  \tmp3,  \tmp1
        vrshr.u16       \tmp5,  d0,  #3         @ out q1

        vadd.u16        d0,  d0,  \tmp3
        @ The output here is written back into the input registers. This doesn't
        @ matter for the flat8out part below, since we only update those pixels
        @ which won't be touched below.
        vbit            d21, d2,  d6
        vbit            d22, d3,  d6
        vbit            d23, d4,  d6
        vrshr.u16       \tmp6,  d0,  #3         @ out q2
        vbit            d24, d5,  d6
        vbit            d25, \tmp5,  d6
        vbit            d26, \tmp6,  d6
.endif
.if \wd == 16
6:
        vorr            d2,  d6,  d7
        vmov            r8,  r9,  d2
        orrs            r8,  r8,  r9
        @ If no pixels need flat8in or flat8out, jump to a
        @ writeout of the inner 4 pixels
        beq             7f
        vmov            r8,  r9,  d7
        orrs            r8,  r8,  r9
        @ If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        beq             8f

        @ flat8out
        @ This writes all outputs into d2-d17 (skipping d7 and d16).
        @ If this part is skipped, the output is read from d21-d26 (which is the input
        @ to this section).
        vshl.u16        d0,  d16, #3  @ 8 * d16
        vsub.u16        d0,  d0,  d16 @ 7 * d16
        vadd.u16        d0,  d0,  d17
        vadd.u16        d8,  d17, d18
        vadd.u16        d10, d19, d20
        vadd.s16        d0,  d0,  d8
        vadd.u16        d8,  d16, d17
        vadd.u16        d12, d21, d22
        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d18, d25
        vadd.u16        d14, d23, d24
        vsub.s16        d10, d10, d8
        vadd.s16        d0,  d0,  d12
        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d18
        vadd.u16        d14, d19, d26
        vrshr.u16       d2,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d19
        vadd.u16        d10, d20, d27
        vsub.s16        d14, d14, d12
        vbif            d2,  d17, d7
        vrshr.u16       d3,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d20
        vadd.u16        d14, d21, d28
        vsub.s16        d10, d10, d8
        vbif            d3,  d18, d7
        vrshr.u16       d4,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d8,  d16, d21
        vadd.u16        d10, d22, d29
        vsub.s16        d14, d14, d12
        vbif            d4,  d19, d7
        vrshr.u16       d5,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d12, d16, d22
        vadd.u16        d14, d23, d30
        vsub.s16        d10, d10, d8
        vbif            d5,  d20, d7
        vrshr.u16       d6,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vadd.u16        d10, d16, d23
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d24, d31
        vbif            d6,  d21, d7
        vrshr.u16       d8,  d0,  #4

        vadd.s16        d0,  d0,  d14
        vsub.s16        d10, d12, d10
        vadd.u16        d12, d17, d24
        vadd.u16        d14, d25, d31
        vbif            d8,  d22, d7
        vrshr.u16       d9,  d0,  #4

        vadd.s16        d0,  d0,  d10
        vsub.s16        d14, d14, d12
        vadd.u16        d12, d26, d31
        vbif            d9,  d23, d7
        vrshr.u16       d10, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d18, d25
        vadd.u16        d18, d19, d26
        vsub.s16        d12, d12, d14
        vadd.u16        d14, d27, d31
        vbif            d10, d24, d7
        vrshr.u16       d11, d0,  #4

        vadd.s16        d0,  d0,  d12
        vadd.u16        d12, d20, d27
        vsub.s16        d14, d14, d18
        vadd.u16        d18, d28, d31
        vbif            d11, d25, d7
        vsub.s16        d18, d18, d12
        vrshr.u16       d12, d0,  #4

        vadd.s16        d0,  d0,  d14
        vadd.u16        d14, d21, d28
        vadd.u16        d20, d29, d31
        vbif            d12, d26, d7
        vrshr.u16       d13, d0,  #4

        vadd.s16        d0,  d0,  d18
        vsub.s16        d20, d20, d14
        vadd.u16        d18, d22, d29
        vadd.u16        d22, d30, d31
        vbif            d13, d27, d7
        vrshr.u16       d14, d0,  #4

        vadd.s16        d0,  d0,  d20
        vsub.s16        d22, d22, d18
        vbif            d14, d28, d7
        vrshr.u16       d15, d0,  #4

        vadd.s16        d0,  d0,  d22
        vbif            d15, d29, d7
        vrshr.u16       d17, d0,  #4
        vbif            d17, d30, d7
.endif
.endm

.macro loop_filter_q_4
        loop_filter_q   4
.endm

.macro loop_filter_q_8
        loop_filter_q   8
.endm

.macro loop_filter_16
        loop_filter     16, d8,  d9,  d10, d11, d12, d13, d14, d15
.endm


@ The public functions in this file have the following signature:
@ void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);
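@ The bpp_frontend macros below scale the E, I and H thresholds (r2-r4) up
@ by BIT_DEPTH - 8 bits and pass a few derived constants to the shared
@ *_16_neon cores, roughly:
@   r5 = 1 << (BIT_DEPTH - 8)   (flatness threshold)
@   r6 = 16 - BIT_DEPTH         (left shift amount used for saturation)
@   r7 = (1 << BIT_DEPTH) - 1   (max pixel value)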

.macro bpp_frontend func, bpp
function ff_\func\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends func
        bpp_frontend    \func, 10
        bpp_frontend    \func, 12
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, rep, dir, bpp
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
.else
        add             r0,  r0,  #8
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \rep >= 4
.ifc \dir,h
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  r1, lsl #2
        bl              \func\()_\int_suffix\()_16_neon
.else
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
        add             r0,  r0,  #8
        bl              \func\()_\int_suffix\()_16_neon
.endif
.endif
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, rep, dir
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 10
        bpp_frontend_rep \func, \suffix, \int_suffix, \rep, \dir, 12
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        push            {r4-r9,lr}
        ldr             r4,  [sp, #28]
        vpush           {q4-q7}
        push            {r2, r3, r4}
        and             r2,  r2,  #0xff
        and             r3,  r3,  #0xff
        and             r4,  r4,  #0xff
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        mov             r5,  #1 << (\bpp - 8)
        mov             r6,  #16 - \bpp
        movw            r7,  #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             r0,  r0,  r1, lsl #3
.else
        add             r0,  r0,  #16
.endif
        pop             {r2, r3, r4}
        lsr             r2,  r2,  #8
        lsr             r3,  r3,  #8
        lsr             r4,  r4,  #8
        lsl             r2,  r2,  #\bpp - 8
        lsl             r3,  r3,  #\bpp - 8
        lsl             r4,  r4,  #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #1

        loop_filter_q_4

        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
9:
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_4_8


function vp9_loop_filter_h_4_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        @ Move r0/r12 forward by 2 pixels; we don't need to rewrite the
        @ outermost 2 pixels since they aren't changed.
        add             r12, r12, #4
        add             r0,  r0,  #4

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_4

        @ We will only write the mid 4 pixels back; after the loop filter,
        @ these are in q10, q11, q12, q13, ordered as rows (8x4 pixels).
        @ They need to be transposed to columns, which is done with a
        @ 4x4 transpose (in practice, two 4x4 transposes of the two
        @ 4x4 halves of the 8x4 pixels, giving 4x8 pixels).
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_4_8


function vp9_loop_filter_v_8_8_16_neon
        sub             r12, r0,  r1, lsl #2
        vld1.16         {q8},  [r12,:128], r1 @ p3
        vld1.16         {q12}, [r0, :128], r1 @ q0
        vld1.16         {q9},  [r12,:128], r1 @ p2
        vld1.16         {q13}, [r0, :128], r1 @ q1
        vld1.16         {q10}, [r12,:128], r1 @ p1
        vld1.16         {q14}, [r0, :128], r1 @ q2
        vld1.16         {q11}, [r12,:128], r1 @ p0
        vld1.16         {q15}, [r0, :128], r1 @ q3
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, r1

        loop_filter_q_8

        vst1.16         {q9},  [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q14}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
9:
        bx              lr
6:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {q10}, [r12,:128], r1
        vst1.16         {q12}, [r0, :128], r1
        vst1.16         {q11}, [r12,:128], r1
        vst1.16         {q13}, [r0, :128], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_v_8_8


function vp9_loop_filter_h_8_8_16_neon
        sub             r12, r0,  #8
        add             r0,  r12, r1, lsl #2
        vld1.16         {q8},  [r12,:64], r1
        vld1.16         {q12}, [r0, :64], r1
        vld1.16         {q9},  [r12,:64], r1
        vld1.16         {q13}, [r0, :64], r1
        vld1.16         {q10}, [r12,:64], r1
        vld1.16         {q14}, [r0, :64], r1
        vld1.16         {q11}, [r12,:64], r1
        vld1.16         {q15}, [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2

        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        loop_filter_q_8

        @ Even though only 6 pixels per row have been changed, we write the
        @ full 8 pixel registers.
        transpose16_q_8x8 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31

        vst1.16         {q8},  [r12,:64], r1
        vst1.16         {q12}, [r0, :64], r1
        vst1.16         {q9},  [r12,:64], r1
        vst1.16         {q13}, [r0, :64], r1
        vst1.16         {q10}, [r12,:64], r1
        vst1.16         {q14}, [r0, :64], r1
        vst1.16         {q11}, [r12,:64], r1
        vst1.16         {q15}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
9:
        add             r0,  r12, #8
        bx              lr
6:
        @ If we didn't need to do the flat8in part, we use the same writeback
        @ as in loop_filter_h_4_8.
        add             r12, r12, #4
        add             r0,  r0,  #4
        transpose16_4x4 q10, q11, q12, q13

        vst1.16         {d20}, [r12], r1
        vst1.16         {d21}, [r0],  r1
        vst1.16         {d22}, [r12], r1
        vst1.16         {d23}, [r0],  r1
        vst1.16         {d24}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        vst1.16         {d26}, [r12], r1
        vst1.16         {d27}, [r0],  r1
        sub             r12, r12, r1, lsl #2
        add             r0,  r12, #4
        bx              lr
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_4_16_neon
        sub             r12, r0,  r1, lsl #3
        @ Read p7-p0 using r12 and q0-q7 using r0
        vld1.16         {d16}, [r12,:64], r1 @ p7
        vld1.16         {d24}, [r0, :64], r1 @ q0
        vld1.16         {d17}, [r12,:64], r1 @ p6
        vld1.16         {d25}, [r0, :64], r1 @ q1
        vld1.16         {d18}, [r12,:64], r1 @ p5
        vld1.16         {d26}, [r0, :64], r1 @ q2
        vld1.16         {d19}, [r12,:64], r1 @ p4
        vld1.16         {d27}, [r0, :64], r1 @ q3
        vld1.16         {d20}, [r12,:64], r1 @ p3
        vld1.16         {d28}, [r0, :64], r1 @ q4
        vld1.16         {d21}, [r12,:64], r1 @ p2
        vld1.16         {d29}, [r0, :64], r1 @ q5
        vld1.16         {d22}, [r12,:64], r1 @ p1
        vld1.16         {d30}, [r0, :64], r1 @ q6
        vld1.16         {d23}, [r12,:64], r1 @ p0
        vld1.16         {d31}, [r0, :64], r1 @ q7
        sub             r12, r12, r1, lsl #3
        sub             r0,  r0,  r1, lsl #3
        add             r12, r12, r1

        loop_filter_16

        @ If we did the flat8out part, we get the output in
        @ d2-d17 (skipping d7 and d16). r12 points to r0 - 7 * stride,
        @ store d2-d9 there, and d10-d17 into r0.
        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d10}, [r0, :64], r1
        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d11}, [r0, :64], r1
        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d12}, [r0, :64], r1
        vst1.16         {d5},  [r12,:64], r1
        vst1.16         {d13}, [r0, :64], r1
        vst1.16         {d6},  [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1
        vst1.16         {d8},  [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1
        vst1.16         {d9},  [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #3
        add             r0,  r0,  r1

9:
        bx              lr

8:
        add             r12, r12, r1, lsl #2
        @ If we didn't do the flat8out part, the output is left in the
        @ input registers.
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        sub             r0,  r0,  r1
        bx              lr
7:
        sub             r12, r0,  r1, lsl #1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #1
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_v_16, 8,  4, 2, v
bpp_frontends_rep vp9_loop_filter_v_16, 16, 4, 4, v

function vp9_loop_filter_h_16_4_16_neon
        sub             r12, r0,  #16
        sub             r0,  r0,  #8
        vld1.16         {d16}, [r12,:64], r1
        vld1.16         {d20}, [r0, :64], r1
        vld1.16         {d17}, [r12,:64], r1
        vld1.16         {d21}, [r0, :64], r1
        vld1.16         {d18}, [r12,:64], r1
        vld1.16         {d22}, [r0, :64], r1
        vld1.16         {d19}, [r12,:64], r1
        vld1.16         {d23}, [r0, :64], r1
        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16
        vld1.16         {d24}, [r12,:64], r1
        vld1.16         {d28}, [r0, :64], r1
        vld1.16         {d25}, [r12,:64], r1
        vld1.16         {d29}, [r0, :64], r1
        vld1.16         {d26}, [r12,:64], r1
        vld1.16         {d30}, [r0, :64], r1
        vld1.16         {d27}, [r12,:64], r1
        vld1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r12, r12, r1, lsl #2
        sub             r12, r12, #16
        sub             r0,  r0,  #16

        @ The 16x4 pixels read above are in four 4x4 blocks
        transpose16_q_4x4 q8,  q9,  d16, d17, d18, d19
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27
        transpose16_q_4x4 q14, q15, d28, d29, d30, d31

        loop_filter_16

        @ Transpose back; this is the same transpose as above, but
        @ we can't take advantage of q registers for it, since not all
        @ of the d registers involved are consecutive.
        transpose16_4x4 d16, d2,  d3,  d4
        transpose16_4x4 d5,  d6,  d8,  d9
        transpose16_4x4 d10, d11, d12, d13
        transpose16_4x4 d14, d15, d17, d31

        vst1.16         {d16}, [r12,:64], r1
        vst1.16         {d5},  [r0, :64], r1

        vst1.16         {d2},  [r12,:64], r1
        vst1.16         {d6},  [r0, :64], r1

        vst1.16         {d3},  [r12,:64], r1
        vst1.16         {d8},  [r0, :64], r1

        vst1.16         {d4},  [r12,:64], r1
        vst1.16         {d9},  [r0, :64], r1

        sub             r12, r12, r1, lsl #2
        sub             r0,  r0,  r1, lsl #2
        add             r12, r12, #16
        add             r0,  r0,  #16

        vst1.16         {d10}, [r12,:64], r1
        vst1.16         {d14}, [r0, :64], r1

        vst1.16         {d11}, [r12,:64], r1
        vst1.16         {d15}, [r0, :64], r1

        vst1.16         {d12}, [r12,:64], r1
        vst1.16         {d17}, [r0, :64], r1

        vst1.16         {d13}, [r12,:64], r1
        vst1.16         {d31}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        sub             r0,  r0,  #8
        bx              lr
9:
        add             r0,  r0,  #8
        bx              lr
8:
        add             r12, r12, #8
        add             r0,  r0,  #8
        transpose16_q_4x4 q10, q11, d20, d21, d22, d23
        transpose16_q_4x4 q12, q13, d24, d25, d26, d27

        vst1.16         {d20}, [r12,:64], r1
        vst1.16         {d24}, [r0, :64], r1
        vst1.16         {d21}, [r12,:64], r1
        vst1.16         {d25}, [r0, :64], r1
        vst1.16         {d22}, [r12,:64], r1
        vst1.16         {d26}, [r0, :64], r1
        vst1.16         {d23}, [r12,:64], r1
        vst1.16         {d27}, [r0, :64], r1
        sub             r0,  r0,  r1, lsl #2
        bx              lr
7:
        add             r12, r12, #12
        add             r0,  r12, r1, lsl #1
        transpose16_q_4x4 q11, q12, d22, d23, d24, d25

        vst1.16         {d22}, [r12], r1
        vst1.16         {d24}, [r0],  r1
        vst1.16         {d23}, [r12], r1
        vst1.16         {d25}, [r0],  r1
        sub             r0,  r0,  r1, lsl #2
        add             r0,  r0,  #4
        bx              lr
endfunc

bpp_frontends_rep vp9_loop_filter_h_16, 8,  4, 2, h
bpp_frontends_rep vp9_loop_filter_h_16, 16, 4, 4, h
