1/*
2 * VP8 NEON optimisations
3 *
4 * Copyright (c) 2010 Rob Clark <rob@ti.com>
5 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
6 * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
7 * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
8 *
9 * This file is part of FFmpeg.
10 *
11 * FFmpeg is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2.1 of the License, or (at your option) any later version.
15 *
16 * FFmpeg is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with FFmpeg; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 */
25
26#include "libavutil/aarch64/asm.S"
27#include "neon.S"
28
// void ff_vp8_luma_dc_wht(int16_t block[4][4][4], int16_t dc[16])
// Inverse 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients.
//   x0 = block: output; one int16 DC value is stored every 32 bytes
//               (i.e. into slot 0 of each of the 16 sub-blocks)
//   x1 = dc:    input; 16 int16 coefficients, cleared to zero on return
function ff_vp8_luma_dc_wht_neon, export=1
        ld1             {v0.4h - v3.4h}, [x1]   // load the 4 input rows
        movi            v30.8h, #0              // zero for clearing the dc block

        // first butterfly pass (interleaved with clearing the input block)
        add             v4.4h,  v0.4h,  v3.4h   // t0 = row0 + row3
        add             v6.4h,  v1.4h,  v2.4h   // t1 = row1 + row2
        st1             {v30.8h}, [x1], #16     // clear dc[0..7]
        sub             v7.4h,  v1.4h,  v2.4h   // t3 = row1 - row2
        sub             v5.4h,  v0.4h,  v3.4h   // t2 = row0 - row3
        st1             {v30.8h}, [x1]          // clear dc[8..15]
        add             v0.4h,  v4.4h,  v6.4h   // t0 + t1
        add             v1.4h,  v5.4h,  v7.4h   // t2 + t3
        sub             v2.4h,  v4.4h,  v6.4h   // t0 - t1
        sub             v3.4h,  v5.4h,  v7.4h   // t2 - t3

        movi            v16.4h, #3              // rounding bias for the final >> 3

        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7

        add             v0.4h,  v0.4h,  v16.4h  // add bias once before the 2nd pass

        // second butterfly pass (same structure as the first)
        add             v4.4h,  v0.4h,  v3.4h
        add             v6.4h,  v1.4h,  v2.4h
        sub             v7.4h,  v1.4h,  v2.4h
        sub             v5.4h,  v0.4h,  v3.4h
        add             v0.4h,  v4.4h,  v6.4h
        add             v1.4h,  v5.4h,  v7.4h
        sub             v2.4h,  v4.4h,  v6.4h
        sub             v3.4h,  v5.4h,  v7.4h

        // (x + 3) >> 3; the +3 was folded in above
        sshr            v0.4h,  v0.4h,  #3
        sshr            v1.4h,  v1.4h,  #3
        sshr            v2.4h,  v2.4h,  #3
        sshr            v3.4h,  v3.4h,  #3

        // scatter the 16 results, one per sub-block (32-byte stride)
        mov             x3,  #32
        st1             {v0.h}[0],  [x0], x3
        st1             {v1.h}[0],  [x0], x3
        st1             {v2.h}[0],  [x0], x3
        st1             {v3.h}[0],  [x0], x3
        st1             {v0.h}[1],  [x0], x3
        st1             {v1.h}[1],  [x0], x3
        st1             {v2.h}[1],  [x0], x3
        st1             {v3.h}[1],  [x0], x3
        st1             {v0.h}[2],  [x0], x3
        st1             {v1.h}[2],  [x0], x3
        st1             {v2.h}[2],  [x0], x3
        st1             {v3.h}[2],  [x0], x3
        st1             {v0.h}[3],  [x0], x3
        st1             {v1.h}[3],  [x0], x3
        st1             {v2.h}[3],  [x0], x3
        st1             {v3.h}[3],  [x0], x3

        ret
endfunc
84
// void ff_vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// Full 4x4 inverse DCT of one block, result added to dst.
//   x0 = dst, x1 = block (16 int16, cleared to zero on return), x2 = stride
// Constants: 20091 and 35468 are the VP8 idct multipliers; 35468 is stored
// halved because sqdmulh doubles the product.
function ff_vp8_idct_add_neon, export=1
        ld1             {v0.8b - v3.8b},  [x1]  // rows 0..3 (as 4 x 4h)
        mov             w4,  #20091
        movk            w4,  #35468/2, lsl #16
        dup             v4.2s, w4               // v4.h[0]=20091, v4.h[1]=35468/2

        // first (column) pass
        smull           v26.4s, v1.4h,  v4.h[0]
        smull           v27.4s, v3.4h,  v4.h[0]
        sqdmulh         v20.4h, v1.4h,  v4.h[1] // row1 * 35468 >> 16
        sqdmulh         v23.4h, v3.4h,  v4.h[1] // row3 * 35468 >> 16
        shrn            v21.4h, v26.4s, #16     // row1 * 20091 >> 16
        shrn            v22.4h, v27.4s, #16     // row3 * 20091 >> 16
        add             v21.4h, v21.4h, v1.4h   // row1 * (1 + 20091/65536)
        add             v22.4h, v22.4h, v3.4h   // row3 * (1 + 20091/65536)

        add             v16.4h,  v0.4h,   v2.4h // t0 = row0 + row2
        sub             v17.4h,  v0.4h,   v2.4h // t1 = row0 - row2

        add             v18.4h,  v21.4h,  v23.4h // t2
        sub             v19.4h,  v20.4h,  v22.4h // t3

        add             v0.4h,   v16.4h,  v18.4h
        add             v1.4h,   v17.4h,  v19.4h
        sub             v3.4h,   v16.4h,  v18.4h
        sub             v2.4h,   v17.4h,  v19.4h

        transpose_4x4H  v0, v1, v2, v3, v24, v5, v6, v7

        // second (row) pass, interleaved with clearing the coefficient
        // block and loading the 4 destination rows
        movi            v29.8h, #0
        smull           v26.4s,     v1.4h,  v4.h[0]
        st1             {v29.8h},   [x1],   #16  // clear block[0..7]
        smull           v27.4s,     v3.4h,  v4.h[0]
        st1             {v29.16b},  [x1]         // clear block[8..15]
        sqdmulh         v21.4h,     v1.4h,  v4.h[1]
        sqdmulh         v23.4h,     v3.4h,  v4.h[1]
        shrn            v20.4h,     v26.4s, #16
        shrn            v22.4h,     v27.4s, #16
        add             v20.4h,     v20.4h, v1.4h
        add             v22.4h,     v22.4h, v3.4h
        add             v16.4h,     v0.4h,  v2.4h
        sub             v17.4h,     v0.4h,  v2.4h

        add             v18.4h,     v20.4h, v23.4h
        ld1             {v24.s}[0], [x0],   x2   // dst row 0
        sub             v19.4h, v21.4h, v22.4h
        ld1             {v25.s}[0], [x0],   x2   // dst row 1
        add             v0.4h,      v16.4h, v18.4h
        add             v1.4h,      v17.4h, v19.4h
        ld1             {v26.s}[0], [x0],   x2   // dst row 2
        sub             v3.4h,      v16.4h, v18.4h
        sub             v2.4h,      v17.4h, v19.4h
        ld1             {v27.s}[0], [x0],   x2   // dst row 3
        srshr           v0.4h,      v0.4h,  #3   // rounded (x + 4) >> 3
        srshr           v1.4h,      v1.4h,  #3
        srshr           v2.4h,      v2.4h,  #3
        srshr           v3.4h,      v3.4h,  #3

        sub             x0,  x0,  x2,  lsl #2    // rewind dst 4 rows

        transpose_4x4H  v0, v1, v2, v3, v5, v6, v7, v16

        // add residual to dst pixels, clamp to u8 and store
        uaddw           v0.8h,  v0.8h, v24.8b
        uaddw           v1.8h,  v1.8h, v25.8b
        uaddw           v2.8h,  v2.8h, v26.8b
        uaddw           v3.8h,  v3.8h, v27.8b
        sqxtun          v0.8b,  v0.8h
        sqxtun          v1.8b,  v1.8h
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h

        st1             {v0.s}[0],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v2.s}[0],  [x0], x2
        st1             {v3.s}[0],  [x0], x2

        ret
endfunc
162
// void ff_vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only idct+add of four chroma blocks laid out 2x2 in an 8x8 area.
//   x0 = dst, x1 = block (one DC per 32 bytes; each DC zeroed after reading),
//   x2 = stride
function ff_vp8_idct_dc_add4uv_neon, export=1
        movi            v0.4h,  #0
        mov             x3,     #32                   // stride between the 4 DCs
        ld1r            {v16.4h},  [x1]               // broadcast DC of block 0
        st1             {v0.h}[0], [x1], x3           // clear it
        ld1r            {v17.4h},  [x1]               // DC of block 1
        st1             {v0.h}[0], [x1], x3
        ld1r            {v18.4h},  [x1]               // DC of block 2
        st1             {v0.h}[0], [x1], x3
        ld1r            {v19.4h},  [x1]               // DC of block 3
        st1             {v0.h}[0], [x1], x3
        ins             v16.d[1],  v17.d[0]           // v16 = DC0 | DC1 (top row pair)
        ins             v18.d[1],  v19.d[0]           // v18 = DC2 | DC3 (bottom row pair)
        mov             x3,  x0                       // x3 = store pointer, x0 keeps loading
        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
        ld1             {v0.8b},   [x0], x2
        srshr           v18.8h,    v18.8h,  #3
        ld1             {v1.8b},   [x0], x2
        // rows 0..3 get DC0|DC1, rows 4..7 get DC2|DC3
        uaddw           v20.8h,    v16.8h, v0.8b
        ld1             {v2.8b},   [x0], x2
        uaddw           v0.8h,     v16.8h, v1.8b
        ld1             {v3.8b},   [x0], x2
        uaddw           v22.8h,    v16.8h, v2.8b
        ld1             {v4.8b},   [x0], x2
        uaddw           v2.8h,     v16.8h, v3.8b
        ld1             {v5.8b},   [x0], x2
        uaddw           v24.8h,    v18.8h, v4.8b
        ld1             {v6.8b},   [x0], x2
        uaddw           v4.8h,     v18.8h, v5.8b
        ld1             {v7.8b},   [x0], x2
        uaddw           v26.8h,    v18.8h, v6.8b
        sqxtun          v20.8b,    v20.8h             // clamp back to u8...
        uaddw           v6.8h,     v18.8h, v7.8b
        sqxtun          v21.8b,    v0.8h
        sqxtun          v22.8b,    v22.8h
        st1             {v20.8b},  [x3], x2           // ...and store, interleaved
        sqxtun          v23.8b,    v2.8h
        st1             {v21.8b},  [x3], x2
        sqxtun          v24.8b,    v24.8h
        st1             {v22.8b},  [x3], x2
        sqxtun          v25.8b,    v4.8h
        st1             {v23.8b},  [x3], x2
        sqxtun          v26.8b,    v26.8h
        st1             {v24.8b},  [x3], x2
        sqxtun          v27.8b,    v6.8h
        st1             {v25.8b},  [x3], x2
        st1             {v26.8b},  [x3], x2
        st1             {v27.8b},  [x3], x2

        ret
endfunc
214
// void ff_vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
// DC-only idct+add of four luma blocks laid out side by side in a 16x4 area.
//   x0 = dst, x1 = block (one DC per 32 bytes; each DC zeroed after reading),
//   x2 = stride
function ff_vp8_idct_dc_add4y_neon, export=1
        movi            v0.16b,  #0
        mov             x3,  #32                      // stride between the 4 DCs
        ld1r            {v16.4h},    [x1]             // broadcast DC of block 0
        st1             {v0.h}[0],   [x1], x3         // clear it
        ld1r            {v17.4h},    [x1]             // DC of block 1
        st1             {v0.h}[0],   [x1], x3
        zip1            v16.2d,      v16.2d, v17.2d   // v16 = DC0 | DC1
        ld1r            {v18.4h},    [x1]             // DC of block 2
        st1             {v0.h}[0],   [x1], x3
        ld1r            {v19.4h},    [x1]             // DC of block 3
        st1             {v0.h}[0],   [x1], x3
        zip1            v18.2d,      v18.2d, v19.2d   // v18 = DC2 | DC3
        srshr           v16.8h,      v16.8h,  #3            // dc >>= 3
        ld1             {v0.16b},     [x0], x2        // 4 rows of 16 dst pixels
        srshr           v18.8h,       v18.8h,  #3
        ld1             {v1.16b},     [x0], x2
        // left 8 pixels get DC0|DC1, right 8 get DC2|DC3
        uaddw           v20.8h,       v16.8h,  v0.8b
        ld1             {v2.16b},     [x0], x2
        uaddw2          v0.8h,        v18.8h,   v0.16b
        ld1             {v3.16b},     [x0], x2
        uaddw           v21.8h, v16.8h,  v1.8b
        uaddw2          v1.8h,  v18.8h,  v1.16b
        uaddw           v22.8h, v16.8h,  v2.8b
        uaddw2          v2.8h,  v18.8h,  v2.16b
        uaddw           v23.8h, v16.8h,  v3.8b
        uaddw2          v3.8h,  v18.8h,  v3.16b
        sub             x0,  x0,  x2,  lsl #2         // rewind dst 4 rows
        // clamp to u8 and store, interleaved
        sqxtun          v20.8b,  v20.8h
        sqxtun2         v20.16b, v0.8h
        sqxtun          v21.8b,  v21.8h
        sqxtun2         v21.16b, v1.8h
        sqxtun          v22.8b,  v22.8h
        st1             {v20.16b},    [x0], x2
        sqxtun2         v22.16b, v2.8h
        st1             {v21.16b},    [x0], x2
        sqxtun          v23.8b,  v23.8h
        st1             {v22.16b},    [x0], x2
        sqxtun2         v23.16b, v3.8h
        st1             {v23.16b},    [x0], x2

        ret
endfunc
258
// void ff_vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
// DC-only idct+add of a single 4x4 block.
//   x0 = dst, x1 = block (DC read and zeroed), x2 = stride
function ff_vp8_idct_dc_add_neon, export=1
        mov             w3,       #0
        ld1r            {v2.8h},  [x1]      // broadcast the DC coefficient
        strh            w3,       [x1]      // clear it
        srshr           v2.8h,  v2.8h,  #3  // dc = (dc + 4) >> 3
        // two dst rows per d-register (4 bytes each)
        ld1             {v0.s}[0],  [x0], x2
        ld1             {v0.s}[1],  [x0], x2
        uaddw           v3.8h,  v2.8h,  v0.8b
        ld1             {v1.s}[0],  [x0], x2
        ld1             {v1.s}[1],  [x0], x2
        uaddw           v4.8h,  v2.8h,  v1.8b
        sqxtun          v0.8b,  v3.8h       // clamp to u8
        sqxtun          v1.8b,  v4.8h
        sub             x0,  x0,  x2, lsl #2 // rewind dst 4 rows
        st1             {v0.s}[0],  [x0], x2
        st1             {v0.s}[1],  [x0], x2
        st1             {v1.s}[0],  [x0], x2
        st1             {v1.s}[1],  [x0], x2
        ret
endfunc
279
280// Register layout:
281//   P3..Q3 -> v0..v7
282//   flim_E -> v22
283//   flim_I -> v23
284//   hev_thresh -> x5
285//
// Core VP8 loop filter for one edge, 16 pixel positions at a time.
//   \inner  = 1: inner-edge variant (4-tap filter_common only, plus P1/Q1
//                adjustment via c3)
//   \simple = 1: simple-filter variant (mask from flim_E only, P0/Q0 updated)
//   neither:     macroblock-edge variant (adds the 6-tap filter_mbedge part)
//   \hev_thresh: general-purpose register holding the hev threshold
// In:  v0..v7 = P3..Q3 (16 lanes each), v22 = flim_E, v23 = flim_I
// Out: filtered pixels in v2..v5 (and v1/v6 for the mbedge variant);
//      v0/v7 (P3/Q3) are preserved.
.macro  vp8_loop_filter, inner=0, simple=0, hev_thresh
    .if \simple
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        uqadd           v19.16b, v17.16b,  v18.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        movi            v21.16b, #0x80
        cmhs            v16.16b, v22.16b, v19.16b    // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim
    .else
        // calculate hev and normal_limit:
        uabd            v20.16b, v2.16b,  v3.16b      // abs(P1-P0)
        uabd            v21.16b, v5.16b,  v4.16b      // abs(Q1-Q0)
        uabd            v18.16b, v0.16b,  v1.16b      // abs(P3-P2)
        uabd            v19.16b, v1.16b,  v2.16b      // abs(P2-P1)
        cmhs            v16.16b, v23.16b, v20.16b     // abs(P1-P0) <= flim_I
        cmhs            v17.16b, v23.16b, v21.16b     // abs(Q1-Q0) <= flim_I
        cmhs            v18.16b, v23.16b, v18.16b     // abs(P3-P2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(P2-P1) <= flim_I
        and             v16.16b, v17.16b, v16.16b
        uabd            v17.16b, v7.16b,  v6.16b      // abs(Q3-Q2)
        and             v16.16b, v16.16b, v19.16b
        uabd            v19.16b, v6.16b,  v5.16b      // abs(Q2-Q1)
        and             v16.16b, v16.16b, v18.16b
        cmhs            v18.16b, v23.16b, v17.16b     // abs(Q3-Q2) <= flim_I
        cmhs            v19.16b, v23.16b, v19.16b     // abs(Q2-Q1) <= flim_I
        uabd            v17.16b, v3.16b,  v4.16b      // abs(P0-Q0)
        uabd            v23.16b, v2.16b,  v5.16b      // abs(P1-Q1)
        and             v16.16b, v16.16b, v18.16b
        uqadd           v17.16b, v17.16b, v17.16b     // abs(P0-Q0) * 2
        and             v16.16b, v16.16b, v19.16b
        ushr            v18.16b, v23.16b, #1          // abs(P1-Q1) / 2
        dup             v23.16b, \hev_thresh          // hev_thresh
        uqadd           v19.16b, v17.16b, v18.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2)
        cmhi            v20.16b, v20.16b, v23.16b     // abs(P1-P0) > hev_thresh
        cmhs            v19.16b, v22.16b, v19.16b     // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E
        cmhi            v22.16b, v21.16b, v23.16b     // abs(Q1-Q0) > hev_thresh
        and             v16.16b, v16.16b, v19.16b
        movi            v21.16b, #0x80
        orr             v17.16b, v20.16b, v22.16b
    .endif

        // at this point:
        //   v16: normal_limit
        //   v17: hev

        // convert to signed value:
        eor            v3.16b, v3.16b, v21.16b           // PS0 = P0 ^ 0x80
        eor            v4.16b, v4.16b, v21.16b           // QS0 = Q0 ^ 0x80

        movi           v20.8h, #3
        ssubl          v18.8h, v4.8b,  v3.8b             // QS0 - PS0
        ssubl2         v19.8h, v4.16b, v3.16b            //   (widened to 16bit)
        eor            v2.16b, v2.16b, v21.16b           // PS1 = P1 ^ 0x80
        eor            v5.16b, v5.16b, v21.16b           // QS1 = Q1 ^ 0x80
        mul            v18.8h, v18.8h, v20.8h            // w = 3 * (QS0 - PS0)
        mul            v19.8h, v19.8h, v20.8h

        sqsub          v20.16b, v2.16b, v5.16b           // clamp(PS1-QS1)
        movi           v22.16b, #4
        movi           v23.16b, #3
    .if \inner
        and            v20.16b, v20.16b, v17.16b         // if(hev) w += clamp(PS1-QS1)
    .endif
        saddw          v18.8h,  v18.8h, v20.8b           // w += clamp(PS1-QS1)
        saddw2         v19.8h,  v19.8h, v20.16b
        sqxtn          v18.8b,  v18.8h                   // narrow result back into v18
        sqxtn2         v18.16b, v19.8h
    .if !\inner && !\simple
        eor            v1.16b,  v1.16b,  v21.16b         // PS2 = P2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b         // QS2 = Q2 ^ 0x80
    .endif
        and            v18.16b, v18.16b, v16.16b         // w &= normal_limit

        // registers used at this point..
        //   v0 -> P3  (don't corrupt)
        //   v1-v6 -> PS2-QS2
        //   v7 -> Q3  (don't corrupt)
        //   v17 -> hev
        //   v18 -> w
        //   v21 -> #0x80
        //   v22 -> #4
        //   v23 -> #3
        //   v16, v19, v29 -> unused
        //
        // filter_common:   is4tap==1
        //   c1 = clamp(w + 4) >> 3;
        //   c2 = clamp(w + 3) >> 3;
        //   Q0 = s2u(QS0 - c1);
        //   P0 = s2u(PS0 + c2);

    .if \simple
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .elseif \inner
        // the !is4tap case of filter_common, only used for inner blocks
        //   c3 = ((c1&~hev) + 1) >> 1;
        //   Q1 = s2u(QS1 - c3);
        //   P1 = s2u(PS1 + c3);
        sqadd          v19.16b, v18.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v18.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)
        bic            v19.16b, v19.16b, v17.16b           // c1 & ~hev
        eor            v4.16b,  v4.16b,  v21.16b           // Q0 = QS0 ^ 0x80
        srshr          v19.16b, v19.16b, #1                // c3 >>= 1
        eor            v3.16b,  v3.16b,  v21.16b           // P0 = PS0 ^ 0x80
        sqsub          v5.16b,  v5.16b,  v19.16b           // QS1 = clamp(QS1-c3)
        sqadd          v2.16b,  v2.16b,  v19.16b           // PS1 = clamp(PS1+c3)
        eor            v5.16b,  v5.16b,  v21.16b           // Q1 = QS1 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b           // P1 = PS1 ^ 0x80
    .else
        and            v20.16b, v18.16b, v17.16b           // w & hev
        sqadd          v19.16b, v20.16b, v22.16b           // c1 = clamp((w&hev)+4)
        sqadd          v20.16b, v20.16b, v23.16b           // c2 = clamp((w&hev)+3)
        sshr           v19.16b, v19.16b, #3                // c1 >>= 3
        sshr           v20.16b, v20.16b, #3                // c2 >>= 3
        bic            v18.16b, v18.16b, v17.16b           // w &= ~hev
        sqsub          v4.16b,  v4.16b,  v19.16b           // QS0 = clamp(QS0-c1)
        sqadd          v3.16b,  v3.16b,  v20.16b           // PS0 = clamp(PS0+c2)

        // filter_mbedge:
        //   a = clamp((27*w + 63) >> 7);
        //   Q0 = s2u(QS0 - a);
        //   P0 = s2u(PS0 + a);
        //   a = clamp((18*w + 63) >> 7);
        //   Q1 = s2u(QS1 - a);
        //   P1 = s2u(PS1 + a);
        //   a = clamp((9*w + 63) >> 7);
        //   Q2 = s2u(QS2 - a);
        //   P2 = s2u(PS2 + a);
        movi           v17.8h,  #63
        sshll          v22.8h,  v18.8b, #3               // 8*w (low half)
        sshll2         v23.8h,  v18.16b, #3              // 8*w (high half)
        saddw          v22.8h,  v22.8h, v18.8b           // 9*w
        saddw2         v23.8h,  v23.8h, v18.16b
        add            v16.8h,  v17.8h, v22.8h
        add            v17.8h,  v17.8h, v23.8h           //  9*w + 63
        add            v19.8h,  v16.8h, v22.8h
        add            v20.8h,  v17.8h, v23.8h           // 18*w + 63
        add            v22.8h,  v19.8h, v22.8h
        add            v23.8h,  v20.8h, v23.8h           // 27*w + 63
        sqshrn         v16.8b,  v16.8h,  #7
        sqshrn2        v16.16b, v17.8h, #7              // clamp(( 9*w + 63)>>7)
        sqshrn         v19.8b,  v19.8h, #7
        sqshrn2        v19.16b, v20.8h, #7              // clamp((18*w + 63)>>7)
        sqshrn         v22.8b,  v22.8h, #7
        sqshrn2        v22.16b, v23.8h, #7              // clamp((27*w + 63)>>7)
        sqadd          v1.16b,  v1.16b,  v16.16b        // PS2 = clamp(PS2+a)
        sqsub          v6.16b,  v6.16b,  v16.16b        // QS2 = clamp(QS2-a)
        sqadd          v2.16b,  v2.16b,  v19.16b        // PS1 = clamp(PS1+a)
        sqsub          v5.16b,  v5.16b,  v19.16b        // QS1 = clamp(QS1-a)
        sqadd          v3.16b,  v3.16b,  v22.16b        // PS0 = clamp(PS0+a)
        sqsub          v4.16b,  v4.16b,  v22.16b        // QS0 = clamp(QS0-a)
        eor            v3.16b,  v3.16b,  v21.16b        // P0 = PS0 ^ 0x80
        eor            v4.16b,  v4.16b,  v21.16b        // Q0 = QS0 ^ 0x80
        eor            v2.16b,  v2.16b,  v21.16b        // P1 = PS1 ^ 0x80
        eor            v5.16b,  v5.16b,  v21.16b        // Q1 = QS1 ^ 0x80
        eor            v1.16b,  v1.16b,  v21.16b        // P2 = PS2 ^ 0x80
        eor            v6.16b,  v6.16b,  v21.16b        // Q2 = QS2 ^ 0x80
    .endif
.endm
458
// Generates ff_vp8_v_loop_filter16\name\()_neon:
// vertical (horizontal-edge) loop filter across a 16-pixel-wide edge.
//   x0 = dst (points at the edge row), x1 = stride,
//   w2 = flim_E, w3 = flim_I (unless simple), w4 = hev_thresh
.macro  vp8_v_loop_filter16 name, inner=0, simple=0
function ff_vp8_v_loop_filter16\name\()_neon, export=1
        // back up 4 rows (2 for the simple filter: only P1..Q1 are used)
        sub             x0,  x0,  x1,  lsl #1+!\simple

        // Load pixels:
    .if !\simple
        ld1             {v0.16b},     [x0], x1 // P3
        ld1             {v1.16b},     [x0], x1 // P2
    .endif
        ld1             {v2.16b},     [x0], x1 // P1
        ld1             {v3.16b},     [x0], x1 // P0
        ld1             {v4.16b},     [x0], x1 // Q0
        ld1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        ld1             {v6.16b},     [x0], x1 // Q2
        ld1             {v7.16b},     [x0]     // Q3
        dup             v23.16b, w3                 // flim_I
    .endif
        dup             v22.16b, w2                 // flim_E

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        // back up to P2:  dst -= stride * 6
        sub             x0,  x0,  x1,  lsl #2
    .if !\simple
        sub             x0,  x0,  x1,  lsl #1

        // Store pixels:
        st1             {v1.16b},     [x0], x1 // P2
    .endif
        st1             {v2.16b},     [x0], x1 // P1
        st1             {v3.16b},     [x0], x1 // P0
        st1             {v4.16b},     [x0], x1 // Q0
        st1             {v5.16b},     [x0], x1 // Q1
    .if !\simple
        st1             {v6.16b},     [x0]     // Q2
    .endif

        ret
endfunc
.endm
500
// Instantiate the normal, inner-edge and simple vertical 16-wide filters.
vp8_v_loop_filter16
vp8_v_loop_filter16 _inner,  inner=1
vp8_v_loop_filter16 _simple, simple=1
504
// Generates ff_vp8_v_loop_filter8uv\name\()_neon:
// vertical loop filter of the 8-wide U and V planes, processed together
// (U in the low d-lane, V in the high d-lane of each vector).
//   x0 = u, x1 = v (both point at the edge row), x2 = stride,
//   w3 = flim_E, w4 = flim_I, w5 = hev_thresh
.macro  vp8_v_loop_filter8uv name, inner=0
function ff_vp8_v_loop_filter8uv\name\()_neon, export=1
        // back up 4 rows in both planes
        sub             x0,  x0,  x2,  lsl #2
        sub             x1,  x1,  x2,  lsl #2
        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2  // P3
        ld1          {v0.d}[1],     [x1], x2  // P3
        ld1          {v1.d}[0],     [x0], x2  // P2
        ld1          {v1.d}[1],     [x1], x2  // P2
        ld1          {v2.d}[0],     [x0], x2  // P1
        ld1          {v2.d}[1],     [x1], x2  // P1
        ld1          {v3.d}[0],     [x0], x2  // P0
        ld1          {v3.d}[1],     [x1], x2  // P0
        ld1          {v4.d}[0],     [x0], x2  // Q0
        ld1          {v4.d}[1],     [x1], x2  // Q0
        ld1          {v5.d}[0],     [x0], x2  // Q1
        ld1          {v5.d}[1],     [x1], x2  // Q1
        ld1          {v6.d}[0],     [x0], x2  // Q2
        ld1          {v6.d}[1],     [x1], x2  // Q2
        ld1          {v7.d}[0],     [x0]      // Q3
        ld1          {v7.d}[1],     [x1]      // Q3

        dup          v22.16b, w3                 // flim_E
        dup          v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        // back up to P2:  u,v -= stride * 6
        sub          x0,  x0,  x2,  lsl #2
        sub          x1,  x1,  x2,  lsl #2
        sub          x0,  x0,  x2,  lsl #1
        sub          x1,  x1,  x2,  lsl #1

        // Store pixels:

        st1          {v1.d}[0],     [x0], x2  // P2
        st1          {v1.d}[1],     [x1], x2  // P2
        st1          {v2.d}[0],     [x0], x2  // P1
        st1          {v2.d}[1],     [x1], x2  // P1
        st1          {v3.d}[0],     [x0], x2  // P0
        st1          {v3.d}[1],     [x1], x2  // P0
        st1          {v4.d}[0],     [x0], x2  // Q0
        st1          {v4.d}[1],     [x1], x2  // Q0
        st1          {v5.d}[0],     [x0], x2  // Q1
        st1          {v5.d}[1],     [x1], x2  // Q1
        st1          {v6.d}[0],     [x0]      // Q2
        st1          {v6.d}[1],     [x1]      // Q2

        ret
endfunc
.endm
556
// Instantiate the normal and inner-edge vertical chroma filters.
vp8_v_loop_filter8uv
vp8_v_loop_filter8uv _inner, inner=1
559
// Generates ff_vp8_h_loop_filter16\name\()_neon:
// horizontal (vertical-edge) loop filter of 16 rows; the 8 pixels
// around the edge are loaded per row and transposed so the shared
// vp8_loop_filter macro can operate on columns.
//   x0 = dst (points at the edge column), x1 = stride,
//   w2 = flim_E, w3 = flim_I (unless simple), w4 = hev_thresh
.macro  vp8_h_loop_filter16 name, inner=0, simple=0
function ff_vp8_h_loop_filter16\name\()_neon, export=1

        sub             x0,  x0,  #4            // back up 4 columns to P3
        // Load pixels: 8 bytes per row, 16 rows
        ld1             {v0.d}[0], [x0], x1
        ld1             {v1.d}[0], [x0], x1
        ld1             {v2.d}[0], [x0], x1
        ld1             {v3.d}[0], [x0], x1
        ld1             {v4.d}[0], [x0], x1
        ld1             {v5.d}[0], [x0], x1
        ld1             {v6.d}[0], [x0], x1
        ld1             {v7.d}[0], [x0], x1
        ld1             {v0.d}[1], [x0], x1
        ld1             {v1.d}[1], [x0], x1
        ld1             {v2.d}[1], [x0], x1
        ld1             {v3.d}[1], [x0], x1
        ld1             {v4.d}[1], [x0], x1
        ld1             {v5.d}[1], [x0], x1
        ld1             {v6.d}[1], [x0], x1
        ld1             {v7.d}[1], [x0], x1

        // rows -> columns: v0..v7 become P3..Q3
        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w2                 // flim_E
    .if !\simple
        dup             v23.16b, w3                 // flim_I
    .endif

        vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4

        sub             x0,  x0,  x1, lsl #4    // backup 16 rows

        // columns -> rows again before storing
        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1             {v0.d}[0], [x0], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v3.d}[0], [x0], x1
        st1             {v4.d}[0], [x0], x1
        st1             {v5.d}[0], [x0], x1
        st1             {v6.d}[0], [x0], x1
        st1             {v7.d}[0], [x0], x1
        st1             {v0.d}[1], [x0], x1
        st1             {v1.d}[1], [x0], x1
        st1             {v2.d}[1], [x0], x1
        st1             {v3.d}[1], [x0], x1
        st1             {v4.d}[1], [x0], x1
        st1             {v5.d}[1], [x0], x1
        st1             {v6.d}[1], [x0], x1
        st1             {v7.d}[1], [x0]

        ret
endfunc
.endm
616
// Instantiate the normal, inner-edge and simple horizontal 16-row filters.
vp8_h_loop_filter16
vp8_h_loop_filter16 _inner,  inner=1
vp8_h_loop_filter16 _simple, simple=1
620
// Generates ff_vp8_h_loop_filter8uv\name\()_neon:
// horizontal loop filter of 8 U rows and 8 V rows, processed together
// (U in the low d-lane, V in the high d-lane), via transpose.
//   x0 = u, x1 = v (both point at the edge column), x2 = stride,
//   w3 = flim_E, w4 = flim_I, w5 = hev_thresh
.macro  vp8_h_loop_filter8uv name, inner=0
function ff_vp8_h_loop_filter8uv\name\()_neon, export=1
        sub             x0,  x0,  #4            // back up 4 columns to P3
        sub             x1,  x1,  #4

        // Load pixels:
        ld1          {v0.d}[0],     [x0], x2 // load u
        ld1          {v0.d}[1],     [x1], x2 // load v
        ld1          {v1.d}[0],     [x0], x2
        ld1          {v1.d}[1],     [x1], x2
        ld1          {v2.d}[0],     [x0], x2
        ld1          {v2.d}[1],     [x1], x2
        ld1          {v3.d}[0],     [x0], x2
        ld1          {v3.d}[1],     [x1], x2
        ld1          {v4.d}[0],     [x0], x2
        ld1          {v4.d}[1],     [x1], x2
        ld1          {v5.d}[0],     [x0], x2
        ld1          {v5.d}[1],     [x1], x2
        ld1          {v6.d}[0],     [x0], x2
        ld1          {v6.d}[1],     [x1], x2
        ld1          {v7.d}[0],     [x0], x2
        ld1          {v7.d}[1],     [x1], x2

        // rows -> columns: v0..v7 become P3..Q3
        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        dup             v22.16b, w3                 // flim_E
        dup             v23.16b, w4                 // flim_I

        vp8_loop_filter inner=\inner, hev_thresh=w5

        sub             x0,  x0,  x2, lsl #3    // backup u 8 rows
        sub             x1,  x1,  x2, lsl #3    // backup v 8 rows

        // columns -> rows again before storing
        transpose_8x16B   v0,  v1,  v2,  v3,  v4,  v5,  v6,  v7, v30, v31

        // Store pixels:
        st1          {v0.d}[0],     [x0], x2 // store u
        st1          {v0.d}[1],     [x1], x2 // store v
        st1          {v1.d}[0],     [x0], x2
        st1          {v1.d}[1],     [x1], x2
        st1          {v2.d}[0],     [x0], x2
        st1          {v2.d}[1],     [x1], x2
        st1          {v3.d}[0],     [x0], x2
        st1          {v3.d}[1],     [x1], x2
        st1          {v4.d}[0],     [x0], x2
        st1          {v4.d}[1],     [x1], x2
        st1          {v5.d}[0],     [x0], x2
        st1          {v5.d}[1],     [x1], x2
        st1          {v6.d}[0],     [x0], x2
        st1          {v6.d}[1],     [x1], x2
        st1          {v7.d}[0],     [x0]
        st1          {v7.d}[1],     [x1]

        ret

endfunc
.endm
678
// Instantiate the normal and inner-edge horizontal chroma filters.
vp8_h_loop_filter8uv
vp8_h_loop_filter8uv _inner, inner=1
681
682
// void ff_put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride,
//                          const uint8_t *src, ptrdiff_t srcstride,
//                          int h, int x, int y)
// Plain 16-wide copy, four rows per iteration; w4 = row count.
function ff_put_vp8_pixels16_neon, export=1
1:
        ld1             {v16.16b},    [x2], x3
        ld1             {v17.16b},    [x2], x3
        ld1             {v18.16b},    [x2], x3
        ld1             {v19.16b},    [x2], x3
        subs            w4,  w4,  #4            // four rows consumed
        st1             {v16.16b},    [x0], x1
        st1             {v17.16b},    [x0], x1
        st1             {v18.16b},    [x0], x1
        st1             {v19.16b},    [x0], x1
        b.gt            1b
        ret
endfunc
697
// void ff_put_vp8_pixels8(uint8_t *dst, ptrdiff_t dststride,
//                         const uint8_t *src, ptrdiff_t srcstride,
//                         int h, int x, int y)
// Plain 8-wide copy, four rows per iteration (two rows per q-register);
// w4 = row count.
function ff_put_vp8_pixels8_neon, export=1
1:
        ld1             {v16.8b},   [x2], x3
        ld1             {v16.d}[1], [x2], x3
        ld1             {v17.8b},   [x2], x3
        ld1             {v17.d}[1], [x2], x3
        subs            w4,  w4,  #4            // four rows consumed
        st1             {v16.8b},   [x0], x1
        st1             {v16.d}[1], [x0], x1
        st1             {v17.8b},   [x0], x1
        st1             {v17.d}[1], [x0], x1
        b.gt            1b
        ret
endfunc
712
713/* 4/6-tap 8th-pel MC */
714
// 6-tap horizontal filter of 8 pixels.
//   \d  = output register (8 filtered bytes)
//   \s0 = leftmost 8 source bytes, \s1 = next 8 (for the ext shifts)
// Filter taps are in v0.h[0..5]; taps 1 and 4 are subtracted (mls),
// matching their negative sign in the VP8 subpel filters.
// Result is (sum + 64) >> 7, saturated to unsigned 8 bit.
.macro  vp8_epel8_h6    d,   s0,   s1
        ext             v22.8b, \s0\().8b,  \s1\().8b,  #1  // src+1
        uxtl            v18.8h, \s0\().8b                   // src+0, widened
        ext             v23.8b, \s0\().8b,  \s1\().8b,  #2  // src+2
        uxtl            v19.8h, v22.8b
        ext             v24.8b, \s0\().8b,  \s1\().8b,  #3  // src+3
        uxtl            v21.8h, v23.8b
        ext             v25.8b, \s0\().8b,  \s1\().8b,  #4  // src+4
        uxtl            v22.8h, v24.8b
        ext             v26.8b, \s0\().8b,  \s1\().8b,  #5  // src+5
        uxtl            v25.8h, v25.8b
        mul             v21.8h, v21.8h, v0.h[2]             // + tap2 * src[2]
        uxtl            v26.8h, v26.8b
        mul             v22.8h, v22.8h, v0.h[3]             // + tap3 * src[3]
        mls             v21.8h, v19.8h, v0.h[1]             // - tap1 * src[1]
        mls             v22.8h, v25.8h, v0.h[4]             // - tap4 * src[4]
        mla             v21.8h, v18.8h, v0.h[0]             // + tap0 * src[0]
        mla             v22.8h, v26.8h, v0.h[5]             // + tap5 * src[5]
        sqadd           v22.8h, v21.8h, v22.8h              // combine both halves, saturating
        sqrshrun        \d\().8b, v22.8h, #7                // (sum + 64) >> 7, clamp to u8
.endm
736
// 6-tap horizontal filter over 16 output pixels.
// \d0       = 16 filtered output bytes
// \v0:\v1   = 32 consecutive source bytes
// Beware: the macro arguments are named v0/v1, but every "v0.h[n]"
// operand below refers to the *real* register v0, which holds the
// filter coefficients (lanes 0-5); taps 1 and 4 are subtracted.
// Rounded narrowing by >>7 (coefficients sum to 128).
// Clobbers: v1-v3, v16-v23.
.macro  vp8_epel16_h6   d0,  v0,  v1
        ext             v22.16b, \v0\().16b, \v1\().16b, #3
        ext             v23.16b, \v0\().16b, \v1\().16b, #4
        uxtl            v19.8h,  v22.8b
        uxtl2           v22.8h,  v22.16b
        ext             v3.16b,  \v0\().16b, \v1\().16b, #2
        uxtl            v20.8h,  v23.8b
        uxtl2           v23.8h,  v23.16b
        ext             v16.16b, \v0\().16b, \v1\().16b, #1
        uxtl            v18.8h,  v3.8b
        uxtl2           v3.8h,   v3.16b
        ext             v2.16b,  \v0\().16b, \v1\().16b, #5
        uxtl            v21.8h,  v2.8b
        uxtl2           v2.8h,   v2.16b
        uxtl            v17.8h,  v16.8b
        uxtl2           v16.8h,  v16.16b
        mul             v19.8h,  v19.8h, v0.h[3]
        mul             v18.8h,  v18.8h, v0.h[2]
        mul             v3.8h,   v3.8h,  v0.h[2]
        mul             v22.8h,  v22.8h, v0.h[3]
        mls             v19.8h,  v20.8h, v0.h[4]
        uxtl            v20.8h,  \v0\().8b
        uxtl2           v1.8h,   \v0\().16b
        mls             v18.8h,  v17.8h, v0.h[1]
        mls             v3.8h,   v16.8h, v0.h[1]
        mls             v22.8h,  v23.8h, v0.h[4]
        mla             v18.8h,  v20.8h, v0.h[0]
        mla             v19.8h,  v21.8h, v0.h[5]
        mla             v3.8h,   v1.8h,  v0.h[0]
        mla             v22.8h,  v2.8h,  v0.h[5]
        sqadd           v19.8h,  v18.8h, v19.8h
        sqadd           v22.8h,  v3.8h,  v22.8h
        sqrshrun        \d0\().8b,  v19.8h, #7
        sqrshrun2       \d0\().16b, v22.8h, #7
.endm
772
// 6-tap vertical filter producing two output rows at once.
// \d0       = filtered row from source rows \s0..\s5
// \d1       = filtered row from source rows \s1..\s6
// \s0..\s6  = seven consecutive 8-byte source rows; they are widened
//             to 16 bit in place, so all seven are destroyed.
// v0.8h holds the coefficients; taps 1 and 4 are subtracted.
// Rounded narrowing by >>7.
// Clobbers: v31 (and all of \s0..\s6).
.macro  vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6
        uxtl            \s0\().8h, \s0\().8b
        uxtl            \s3\().8h, \s3\().8b
        uxtl            \s6\().8h, \s6\().8b
        uxtl            \s1\().8h, \s1\().8b
        uxtl            \s4\().8h, \s4\().8b
        uxtl            \s2\().8h, \s2\().8b
        uxtl            \s5\().8h, \s5\().8b
        mul             \s0\().8h, \s0\().8h, v0.h[0]
        mul             v31.8h   , \s3\().8h, v0.h[3]
        mul             \s3\().8h, \s3\().8h, v0.h[2]
        mul             \s6\().8h, \s6\().8h, v0.h[5]

        mls             \s0\().8h, \s1\().8h, v0.h[1]
        mls             v31.8h   , \s4\().8h, v0.h[4]
        mls             \s3\().8h, \s2\().8h, v0.h[1]
        mls             \s6\().8h, \s5\().8h, v0.h[4]

        mla             \s0\().8h, \s2\().8h, v0.h[2]
        mla             v31.8h   , \s5\().8h, v0.h[5]
        mla             \s3\().8h, \s1\().8h, v0.h[0]
        mla             \s6\().8h, \s4\().8h, v0.h[3]
        sqadd           v31.8h   , \s0\().8h, v31.8h
        sqadd           \s6\().8h, \s3\().8h, \s6\().8h
        sqrshrun        \d0\().8b, v31.8h,    #7
        sqrshrun        \d1\().8b, \s6\().8h, #7
.endm
800
// 4-tap horizontal filter over 8 output pixels.
// \d        = 8 filtered output bytes
// \v0:\v1   = source bytes; offsets +1..+3 are pulled across the pair
//             with ext, so only 11 source bytes are actually consumed.
// v0.8h (the real register, not the \v0 argument) holds the
// coefficients in lanes 1-4; taps 1 and 4 are subtracted.
// Rounded narrowing by >>7.
// Clobbers: v19, v20, v22, v23, v25.
.macro  vp8_epel8_h4    d,   v0,   v1
        ext             v22.8b, \v0\().8b,  \v1\().8b,  #1
        uxtl            v19.8h, \v0\().8b
        ext             v23.8b, \v0\().8b,  \v1\().8b,  #2
        uxtl            v20.8h, v22.8b
        ext             v25.8b, \v0\().8b,  \v1\().8b,  #3
        uxtl            v22.8h, v23.8b
        uxtl            v25.8h, v25.8b
        mul             v20.8h, v20.8h, v0.h[2]
        mul             v22.8h, v22.8h, v0.h[3]
        mls             v20.8h, v19.8h, v0.h[1]
        mls             v22.8h, v25.8h, v0.h[4]
        sqadd           v22.8h, v20.8h, v22.8h
        sqrshrun        \d\().8b, v22.8h, #7
.endm
816
// 4-tap vertical filter producing two output rows packed in one register:
// \d0 low 8 bytes  = filtered row from source rows \s0..\s3
// \d0 high 8 bytes = filtered row from source rows \s1..\s4
// \s0..\s4  = five consecutive 8-byte source rows; widened in place,
//             so all five are destroyed (\s2 additionally reused as an
//             accumulator).
// v0.8h holds the coefficients in lanes 1-4; taps 1 and 4 subtracted.
// Rounded narrowing by >>7.
// Clobbers: v21, v22, v23 (and all of \s0..\s4).
.macro  vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4
        uxtl            \s0\().8h,  \s0\().8b
        uxtl            \s1\().8h,  \s1\().8b
        uxtl            \s2\().8h,  \s2\().8b
        uxtl            \s3\().8h,  \s3\().8b
        uxtl            \s4\().8h,  \s4\().8b
        mul             v21.8h,     \s1\().8h, v0.h[2]
        mul             v23.8h,     \s2\().8h, v0.h[3]
        mul             \s2\().8h,  \s2\().8h, v0.h[2]
        mul             v22.8h,     \s3\().8h, v0.h[3]
        mls             v21.8h,     \s0\().8h, v0.h[1]
        mls             v23.8h,     \s3\().8h, v0.h[4]
        mls             \s2\().8h,  \s1\().8h, v0.h[1]
        mls             v22.8h,     \s4\().8h, v0.h[4]
        sqadd           v21.8h,     v21.8h,    v23.8h
        sqadd           \s2\().8h,  \s2\().8h, v22.8h
        sqrshrun        \d0\().8b,  v21.8h,    #7
        sqrshrun2       \d0\().16b, \s2\().8h, #7
.endm
836
837
838// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
839// arithmetic can be used to apply filters
// VP8 subpel filter table: one 16-byte row (8 halfwords, last two unused)
// per fractional position 1..7.  Position 0 means "no filtering" and has
// no entry, which is why callers address the table with a -16 bias.
// Even positions (rows 2/4/6) are full 6-tap filters; odd positions have
// zero outer taps, i.e. they are effectively 4-tap.
const   subpel_filters, align=4
        .short     0,   6, 123,  12,   1,   0,   0,   0
        .short     2,  11, 108,  36,   8,   1,   0,   0
        .short     0,   9,  93,  50,   6,   0,   0,   0
        .short     3,  16,  77,  77,  16,   3,   0,   0
        .short     0,   6,  50,  93,   9,   0,   0,   0
        .short     1,   8,  36, 108,  11,   2,   0,   0
        .short     0,   1,  12, 123,   6,   0,   0,   0
endconst
849
// 16-wide, 6-tap vertical-only MC.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (processed two rows per iteration), w6 = my index 1..7
function ff_put_vp8_epel16_v6_neon, export=1
        // 6-tap needs two rows of context above the current row
        sub             x2,  x2,  x3,  lsl #1

        sxtw            x4,  w4
        sxtw            x6,  w6
        movrel          x17,  subpel_filters, -16
        add             x6,  x17,  x6, lsl #4  // y
        ld1             {v0.8h},     [x6]
1:
        // load 7 source rows (left/right 8-byte halves in reg pairs)
        ld1             {v1.1d - v2.1d},    [x2], x3
        ld1             {v3.1d - v4.1d},    [x2], x3
        ld1             {v16.1d - v17.1d},  [x2], x3
        ld1             {v18.1d - v19.1d},  [x2], x3
        ld1             {v20.1d - v21.1d},  [x2], x3
        ld1             {v22.1d - v23.1d},  [x2], x3
        ld1             {v24.1d - v25.1d},  [x2]
        // rewind 4 rows: net advance of 2 rows per iteration
        sub             x2,  x2,  x3, lsl #2

        // left 8 columns then right 8 columns; each call emits 2 rows
        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25

        st1             {v1.1d - v2.1d}, [x0], x1
        st1             {v3.1d - v4.1d}, [x0], x1
        subs            x4, x4, #2
        b.ne            1b

        ret
endfunc
878
// 16-wide, 6-tap horizontal-only MC.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx index 1..7
function ff_put_vp8_epel16_h6_neon, export=1
        sub             x2,  x2,  #2    // 6-tap needs 2 columns of left context

        sxtw            x5,  w5 // x

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        add             x5,  x17,  x5, lsl #4 // x
        ld1             {v0.8h},  [x5]
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x0], x1

        subs            w4, w4, #1
        b.ne            1b
        ret
endfunc
896
897
// 16-wide, 6-tap horizontal + 6-tap vertical MC (two passes through a
// stack scratch buffer of h+5 rows of 16 bytes; 336 = 21*16 covers the
// maximum h of 16, plus 16 for manual 16-byte alignment via bic).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel16_h6v6_neon, export=1
        // back up 2 rows and 2 columns of filter context
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5 // x
        add             x16,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #336+16
        ld1             {v0.8h},  [x16]
        add             x7,  sp,  #15
        sxtw            x4,  w4
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15   // x7 = 16-byte-aligned scratch base
1:
        ld1             {v1.16b, v2.16b}, [x2], x3
        vp8_epel16_h6   v1, v1, v2
        st1             {v1.16b}, [x7], #16
        subs            x16, x16, #1
        b.ne            1b


        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,  #15
        ld1             {v0.8h},     [x6]
        bic             x7,  x7,  #15
2:
        // 7 scratch rows; odd regs = left halves, even regs = right halves
        ld1             {v1.8b - v4.8b},    [x7], #32
        ld1             {v16.8b - v19.8b},  [x7], #32
        ld1             {v20.8b - v23.8b},  [x7], #32
        ld1             {v24.8b - v25.8b},  [x7]
        sub             x7,  x7,  #64   // net advance: 2 scratch rows

        vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24
        vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25
        // rejoin left/right halves into full 16-byte rows
        trn1            v1.2d, v1.2d, v2.2d
        trn1            v3.2d, v3.2d, v4.2d

        st1             {v1.16b}, [x0], x1
        st1             {v3.16b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #336+16
        ret
endfunc
946
// 8-wide, 6-tap vertical-only MC; two output rows per iteration.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w6 = my index 1..7
function ff_put_vp8_epel8_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows of top context

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},  [x6]
1:
        // 7 consecutive source rows
        ld1             {v2.8b},  [x2], x3
        ld1             {v3.8b},  [x2], x3
        ld1             {v4.8b},  [x2], x3
        ld1             {v5.8b},  [x2], x3
        ld1             {v6.8b},  [x2], x3
        ld1             {v7.8b},  [x2], x3
        ld1             {v28.8b}, [x2]

        sub             x2,  x2,  x3,  lsl #2   // net advance: 2 rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        st1             {v2.8b}, [x0], x1
        st1             {v3.8b}, [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
973
// 8-wide, 6-tap horizontal-only MC.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx index 1..7
function ff_put_vp8_epel8_h6_neon, export=1
        sub             x2,  x2,  #2    // 2 columns of left context

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},        [x5]
1:
        ld1             {v2.8b, v3.8b}, [x2], x3

        vp8_epel8_h6    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
991
// 8-wide, 6-tap horizontal + 6-tap vertical MC (two passes through a
// stack scratch buffer of h+5 rows of 8 bytes; 168 = 21*8 covers the
// maximum h of 16, plus 16 for manual 16-byte alignment via bic).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel8_h6v6_neon, export=1
        // back up 2 rows and 2 columns of filter context
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4,  #5   // h
        bic             x7,  x7,  #15   // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        // 7 scratch rows of 8 bytes each
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16   // net advance: 2 scratch rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1037
// 8-wide, 4-tap vertical-only MC; two output rows per iteration
// (the v4_y2 macro packs both rows in v2: low/high 8 bytes).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w6 = my index 1..7
function ff_put_vp8_epel8_v4_neon, export=1
        sub             x2,  x2,  x3    // 1 row of top context

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},     [x6]
1:
        // 5 consecutive source rows
        ld1             {v2.8b},     [x2], x3
        ld1             {v3.8b},     [x2], x3
        ld1             {v4.8b},     [x2], x3
        ld1             {v5.8b},     [x2], x3
        ld1             {v6.8b},     [x2]
        sub             x2,  x2,  x3,  lsl #1   // net advance: 2 rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x0], x1
        subs            w4,  w4,  #2
        b.ne            1b

        ret
endfunc
1061
// 8-wide, 4-tap horizontal-only MC.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx index 1..7
function ff_put_vp8_epel8_h4_neon, export=1
        sub             x2,  x2,  #1    // 1 column of left context

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]
1:
        ld1             {v2.8b,v3.8b}, [x2], x3

        vp8_epel8_h4    v2,  v2,  v3

        st1             {v2.8b}, [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1079
// 8-wide, 4-tap horizontal + 6-tap vertical MC; same two-pass scheme as
// the h6v6 variant (scratch buffer of h+5 rows of 8 bytes on the stack),
// but the first pass only needs 1 column of left context.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel8_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows for the 6-tap pass
        sub             x2,  x2,  #1            // 1 column for the 4-tap pass
        sxtw            x4,  w4

        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #5   // h
        bic             x7,  x7,  #15   // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        // 7 scratch rows of 8 bytes each
        ld1             {v1.8b - v4.8b}, [x7], #32
        ld1             {v5.8b - v7.8b}, [x7]

        sub             x7,  x7,  #16   // net advance: 2 scratch rows

        vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7

        st1             {v1.8b}, [x0], x1
        st1             {v2.8b}, [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1125
// 8-wide, 4-tap horizontal + 4-tap vertical MC; two passes through a
// stack scratch buffer of h+3 rows of 8 bytes.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel8_h4v4_neon, export=1
        sub             x2,  x2,  x3    // 1 row of top context
        sub             x2,  x2,  #1    // 1 column of left context
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15   // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h4    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        // 5 scratch rows; net advance of 2 rows per iteration
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1170
// 8-wide, 6-tap horizontal + 4-tap vertical MC; two passes through a
// stack scratch buffer of h+3 rows of 8 bytes.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel8_h6v4_neon, export=1
        sub             x2,  x2,  x3    // 1 row for the 4-tap vertical pass
        sub             x2,  x2,  #2    // 2 columns for the 6-tap pass
        sxtw            x4,  w4


        // first pass (horizontal):
        movrel          x17,  subpel_filters, -16
        sxtw            x5,  w5
        add             x5,  x17,  x5, lsl #4 // x
        sub             sp,  sp,  #168+16
        ld1             {v0.8h},  [x5]
        add             x7,  sp,  #15
        add             x16, x4, #3   // h
        bic             x7,  x7,  #15   // x7 = aligned scratch base
1:
        ld1             {v1.8b, v2.8b}, [x2], x3

        vp8_epel8_h6    v1, v1, v2

        st1             {v1.8b}, [x7], #8
        subs            x16, x16, #1
        b.ne            1b

        // second pass (vertical):
        sxtw            x6,  w6
        add             x6,  x17,  x6, lsl #4  // y
        add             x7,  sp,   #15
        ld1             {v0.8h},   [x6]
        bic             x7,  x7,   #15
2:
        // 5 scratch rows; net advance of 2 rows per iteration
        ld1             {v1.8b - v2.8b}, [x7], #16
        ld1             {v3.8b - v5.8b}, [x7]

        vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5

        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x0], x1
        subs            x4, x4, #2
        b.ne            2b

        add             sp,  sp,  #168+16
        ret
endfunc
1215
// 4-wide, 6-tap vertical-only MC; four output rows per iteration by
// packing two 4-pixel rows per 64-bit register: lane .s[0] holds one
// row, lane .s[1] the row four positions below it.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w6 = my index 1..7
function ff_put_vp8_epel4_v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1   // 2 rows of top context

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},    [x6]
1:
        // rows r..r+6 into lane 0 (ld1r replicates; lane 1 overwritten below)
        ld1r            {v2.2s},    [x2], x3
        ld1r            {v3.2s},    [x2], x3
        ld1r            {v4.2s},    [x2], x3
        ld1r            {v5.2s},    [x2], x3
        ld1r            {v6.2s},    [x2], x3
        ld1r            {v7.2s},    [x2], x3
        ld1r            {v28.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #2
        // rows r+2..r+8 into lane 1
        ld1             {v2.s}[1],  [x2], x3
        ld1             {v3.s}[1],  [x2], x3
        ld1             {v4.s}[1],  [x2], x3
        ld1             {v5.s}[1],  [x2], x3
        ld1             {v6.s}[1],  [x2], x3
        ld1             {v7.s}[1],  [x2], x3
        ld1             {v28.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #2   // net advance: 4 rows

        vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28

        // store rows in order: v2/v3 lane 0, then v2/v3 lane 1
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
1251
// 4-wide, 6-tap horizontal-only MC (computes 8 pixels, stores 4).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx index 1..7
function ff_put_vp8_epel4_h6_neon, export=1
        sub             x2,  x2,  #2    // 2 columns of left context

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0], [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1267
// 4-wide, 6-tap horizontal + 6-tap vertical MC; two passes through a
// stack scratch buffer of h+5 rows of 4 bytes (52 = 13*4 for max h = 8).
// The second pass reads overlapping 8-byte groups and transposes so
// each .2s lane holds one 4-pixel row (lane 0 = row r, lane 1 = r+2),
// producing four output rows per iteration.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel4_h6v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]

        sub             sp,  sp,  #52
        add             w8,  w4,  #5    // h+5 first-pass rows
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2,  v2,  v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rows 0..6 (two 4-byte rows per 8-byte load), then rows 2..8
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16   // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        // v2/v3 lane 0 = output rows 0/1, lane 1 = rows 2/3
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc
1315
// 4-wide, 4-tap horizontal + 6-tap vertical MC; identical second pass to
// the h6v6 variant (stack scratch of h+5 4-byte rows, transposed so each
// .2s lane is one row; four output rows per iteration).  The first pass
// only needs 7 source bytes per row, so one 8-byte load suffices and the
// h4 macro is fed v2 as both source halves.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel4_h4v6_neon, export=1
        sub             x2,  x2,  x3,  lsl #1
        sub             x2,  x2,  #1

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]

        sub             sp,  sp,  #52
        add             w8,  w4,  #5    // h+5 first-pass rows
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rows 0..6, then rows 2..8, transposed into per-row lanes
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1             {v6.8b},       [x9], #8
        ld1r            {v28.2s},      [x9]
        sub             x9,  x9,  #16
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v7.8b},       [x9], #8
        ld1             {v28.s}[1],    [x9]
        sub             x9,  x9,  #16   // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        trn1            v3.2s, v6.2s, v7.2s
        trn2            v7.2s, v6.2s, v7.2s
        vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28
        st1             {v2.s}[0],  [x0], x1
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x0], x1
        st1             {v3.s}[1],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #52
        ret
endfunc
1363
// 4-wide, 6-tap horizontal + 4-tap vertical MC; two passes through a
// stack scratch buffer of h+3 rows of 4 bytes (44 = 11*4 for max h = 8).
// Second pass transposes pairs of scratch rows into .2s lanes and emits
// four output rows per iteration; the v4_y2 macro packs them in v1 as
// lanes [0]=row0, [2]=row1, [1]=row2, [3]=row3 (matching the stores).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel4_h6v4_neon, export=1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #2

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]

        sub             sp,  sp,  #44
        add             w8,  w4,  #3    // h+3 first-pass rows
        mov             x9,  sp
1:
        ld1             {v2.8b,v3.8b}, [x2], x3
        vp8_epel8_h6    v2, v2, v3
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rows 0..4, then rows 2..6, transposed into per-row lanes
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8    // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0],  [x0], x1
        st1             {v1.s}[2],  [x0], x1
        st1             {v1.s}[1],  [x0], x1
        st1             {v1.s}[3],  [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc
1407
// 4-wide, 4-tap horizontal-only MC.  7 source bytes per row suffice, so
// one 8-byte load is enough and v2 is fed to the macro as both halves.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height, w5 = mx index 1..7
function ff_put_vp8_epel4_h4_neon, export=1
        sub             x2,  x2,  #1    // 1 column of left context

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},    [x5]
1:
        ld1             {v2.8b},    [x2], x3
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],  [x0], x1
        subs            w4,  w4,  #1
        b.ne            1b

        ret
endfunc
1423
// 4-wide, 4-tap vertical-only MC; four output rows per iteration.
// Rows r..r+4 go to lane .s[0] of v2..v6 and rows r+2..r+6 to lane
// .s[1]; the v4_y2 macro then packs outputs in v1 as lanes
// [0]=row0, [2]=row1, [1]=row2, [3]=row3 (matching the stores).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w6 = my index 1..7
function ff_put_vp8_epel4_v4_neon, export=1
        sub             x2,  x2,  x3    // 1 row of top context

        movrel          x7,  subpel_filters, -16
        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},   [x6]
1:
        ld1r            {v2.2s},   [x2], x3
        ld1r            {v3.2s},   [x2], x3
        ld1r            {v4.2s},   [x2], x3
        ld1r            {v5.2s},   [x2], x3
        ld1r            {v6.2s},   [x2]
        sub             x2,  x2,  x3,  lsl #1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x2], x3
        ld1             {v4.s}[1], [x2], x3
        ld1             {v5.s}[1], [x2], x3
        ld1             {v6.s}[1], [x2]
        sub             x2,  x2,  x3,  lsl #1   // net advance: 4 rows

        vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6

        st1             {v2.s}[0], [x0], x1
        st1             {v2.s}[2], [x0], x1
        st1             {v2.s}[1], [x0], x1
        st1             {v2.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            1b

        ret
endfunc
1455
// 4-wide, 4-tap horizontal + 4-tap vertical MC; two passes through a
// stack scratch buffer of h+3 rows of 4 bytes (44 = 11*4 for max h = 8).
// Second pass transposes pairs of scratch rows into .2s lanes and emits
// four output rows per iteration, packed in v1 as lanes
// [0]=row0, [2]=row1, [1]=row2, [3]=row3 (matching the stores).
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (multiple of 4), w5 = mx, w6 = my (indices 1..7)
function ff_put_vp8_epel4_h4v4_neon, export=1
        sub             x2,  x2,  x3    // 1 row of top context
        sub             x2,  x2,  #1    // 1 column of left context

        movrel          x7,  subpel_filters, -16
        add             x5,  x7,  w5, uxtw #4
        ld1             {v0.8h},       [x5]

        sub             sp,  sp,  #44
        add             w8,  w4,  #3    // h+3 first-pass rows
        mov             x9,  sp
1:
        ld1             {v2.8b},       [x2], x3
        // Only 7 source bytes are needed for 4 outputs, all within v2,
        // so feed v2 as both macro source halves (as the other 4-wide h4
        // paths do) instead of reading an uninitialized register.
        vp8_epel8_h4    v2,  v2,  v2
        st1             {v2.s}[0],     [x9], #4
        subs            w8,  w8,  #1
        b.ne            1b

        add             x6,  x7,  w6, uxtw #4
        ld1             {v0.8h},       [x6]
        mov             x9,  sp
2:
        // rows 0..4, then rows 2..6, transposed into per-row lanes
        ld1             {v2.8b,v3.8b}, [x9], #16
        ld1r            {v6.2s},       [x9]
        sub             x9,  x9,  #8
        ld1             {v4.8b,v5.8b}, [x9], #16
        ld1             {v6.s}[1],     [x9]
        sub             x9,  x9,  #8    // net advance: 4 rows
        trn1            v1.2s, v2.2s, v4.2s
        trn2            v4.2s, v2.2s, v4.2s
        trn1            v2.2s, v3.2s, v5.2s
        trn2            v5.2s, v3.2s, v5.2s
        vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[1], [x0], x1
        st1             {v1.s}[3], [x0], x1
        subs            w4,  w4,  #4
        b.ne            2b

        add             sp,  sp,  #44
        ret
endfunc
1499
1500/* Bilinear MC */
1501
// 16-wide horizontal bilinear MC:
//   dst[x] = ((8-mx)*src[x] + mx*src[x+1] + 4) >> 3   (rshrn rounds)
// Two rows per iteration; 24 source bytes loaded per row to cover the
// x+1 access of the last pixel.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (even), w5 = mx (0..7)
function ff_put_vp8_bilin16_h_neon, export=1
        mov             w7,     #8
        dup             v0.8b,  w5      // v0 = mx
        sub             w5,     w7,     w5
        dup             v1.8b,  w5      // v1 = 8 - mx
1:
        subs            w4,     w4,     #2
        ld1             {v2.8b,v3.8b,v4.8b},    [x2], x3
        ext             v5.8b,  v3.8b,  v4.8b,  #1
        ext             v4.8b,  v2.8b,  v3.8b,  #1
        umull           v16.8h, v2.8b,  v1.8b
        umlal           v16.8h, v4.8b,  v0.8b
        ld1             {v18.8b,v19.8b,v20.8b}, [x2], x3
        umull           v6.8h,  v3.8b,  v1.8b
        umlal           v6.8h,  v5.8b,  v0.8b
        ext             v21.8b, v19.8b, v20.8b, #1
        ext             v20.8b, v18.8b, v19.8b, #1
        umull           v22.8h, v18.8b, v1.8b
        umlal           v22.8h, v20.8b, v0.8b
        umull           v24.8h, v19.8b, v1.8b
        umlal           v24.8h, v21.8b, v0.8b
        rshrn           v4.8b,  v16.8h, #3
        rshrn2          v4.16b, v6.8h,  #3
        rshrn           v6.8b,  v22.8h, #3
        rshrn2          v6.16b, v24.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
1533
// 16-wide vertical bilinear MC:
//   dst[y] = ((8-my)*src[y] + my*src[y+1] + 4) >> 3   (rshrn rounds)
// Two rows per iteration; v2 carries the last loaded row across
// iterations so each source row is loaded only once.
// In: x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
//     w4 = height (even), w6 = my (0..7)
function ff_put_vp8_bilin16_v_neon, export=1
        mov             w7,     #8
        dup             v0.16b, w6      // v0 = my
        sub             w6,     w7,     w6
        dup             v1.16b, w6      // v1 = 8 - my

        ld1             {v2.16b}, [x2], x3
1:
        subs            w4,     w4,     #2
        ld1             {v4.16b}, [x2], x3
        umull           v6.8h,  v2.8b,  v1.8b
        umlal           v6.8h,  v4.8b,  v0.8b
        umull2          v16.8h, v2.16b, v1.16b
        umlal2          v16.8h, v4.16b, v0.16b
        ld1             {v2.16b}, [x2], x3
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v2.8b,  v0.8b
        umull2          v20.8h, v4.16b, v1.16b
        umlal2          v20.8h, v2.16b, v0.16b
        rshrn           v4.8b,  v6.8h,  #3
        rshrn2          v4.16b, v16.8h, #3
        rshrn           v6.8b,  v18.8h, #3
        rshrn2          v6.16b, v20.8h, #3
        st1             {v4.16b}, [x0], x1
        st1             {v6.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
1563
// 16-pixel-wide 2-D bilinear interpolation: each source row is first
// filtered horizontally with weights (8-mx, mx), then consecutive
// filtered rows are blended vertically with weights (8-my, my).  Both
// stages round with (sum + 4) >> 3 (rshrn #3).  Two output rows per
// loop iteration; the previous filtered row is carried in v4.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w5 = mx, w6 = my.
// Verify against the C prototype.
function ff_put_vp8_bilin16_hv_neon, export=1
        mov             w7,      #8
        dup             v0.8b,   w5            // mx
        sub             w5,      w7,     w5
        dup             v1.8b,   w5            // 8 - mx
        dup             v2.16b,  w6            // my
        sub             w6,      w7,     w6
        dup             v3.16b,  w6            // 8 - my

        // Prime the pipeline: horizontally filter the first row into v4.
        // 24 bytes are loaded to cover the 17 source pixels needed for
        // 16 filtered outputs.
        ld1             {v4.8b,v5.8b,v6.8b},    [x2], x3

        ext             v7.8b,   v5.8b,  v6.8b, #1    // src[9..16]
        ext             v6.8b,   v4.8b,  v5.8b, #1    // src[1..8]
        umull           v16.8h,  v4.8b,  v1.8b        // src[x]*(8-mx), low 8
        umlal           v16.8h,  v6.8b,  v0.8b        //  + src[x+1]*mx
        umull           v18.8h,  v5.8b,  v1.8b        // same, high 8
        umlal           v18.8h,  v7.8b,  v0.8b
        rshrn           v4.8b,   v16.8h, #3           // v4 = filtered previous row
        rshrn2          v4.16b,  v18.8h, #3
1:
        subs            w4,  w4,  #2                  // two output rows per iteration
        // Horizontal pass for the first new row.
        ld1             {v18.8b,v19.8b,v20.8b},  [x2], x3
        ext             v21.8b,  v19.8b, v20.8b, #1
        ext             v20.8b,  v18.8b, v19.8b, #1
        umull           v22.8h,  v18.8b, v1.8b
        umlal           v22.8h,  v20.8b, v0.8b
        // Horizontal pass for the second new row (interleaved with the
        // remaining multiplies of the first row).
        ld1             {v26.8b,v27.8b,v28.8b},  [x2], x3
        umull           v24.8h,  v19.8b, v1.8b
        umlal           v24.8h,  v21.8b, v0.8b
        ext             v29.8b,  v27.8b, v28.8b, #1
        ext             v28.8b,  v26.8b, v27.8b, #1
        umull           v16.8h,  v26.8b, v1.8b
        umlal           v16.8h,  v28.8b, v0.8b
        umull           v18.8h,  v27.8b, v1.8b
        umlal           v18.8h,  v29.8b, v0.8b
        rshrn           v6.8b,   v22.8h, #3           // v6 = filtered row 1
        rshrn2          v6.16b,  v24.8h, #3
        // Vertical blend: previous row (v4) with row 1 (v6).
        umull           v24.8h,  v4.8b,  v3.8b        // prev*(8-my)
        umlal           v24.8h,  v6.8b,  v2.8b        //  + row1*my
        umull2          v30.8h,  v4.16b, v3.16b
        umlal2          v30.8h,  v6.16b, v2.16b
        rshrn           v4.8b,   v16.8h, #3           // v4 = filtered row 2 (next prev)
        rshrn2          v4.16b,  v18.8h, #3
        // Vertical blend: row 1 (v6) with row 2 (v4).
        umull           v20.8h,  v6.8b,  v3.8b
        umlal           v20.8h,  v4.8b,  v2.8b
        umull2          v22.8h,  v6.16b, v3.16b
        umlal2          v22.8h,  v4.16b, v2.16b
        rshrn           v24.8b,  v24.8h, #3
        rshrn2          v24.16b, v30.8h, #3
        st1             {v24.16b}, [x0], x1
        rshrn           v20.8b,  v20.8h, #3
        rshrn2          v20.16b, v22.8h, #3
        st1             {v20.16b}, [x0], x1
        b.gt            1b

        ret
endfunc
1621
// 8-pixel-wide horizontal bilinear interpolation:
//   dst[x] = (src[x]*(8-mx) + src[x+1]*mx + 4) >> 3
// Two output rows per loop iteration.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w5 = mx.  Verify
// against the C prototype.
function ff_put_vp8_bilin8_h_neon, export=1
        mov             w7,     #8
        dup             v0.8b,  w5              // mx
        sub             w5,     w7,     w5
        dup             v1.8b,  w5              // 8 - mx
1:
        subs            w4,     w4,     #2      // two rows per iteration
        ld1             {v2.8b,v3.8b},  [x2],  x3    // 16 bytes: covers src[0..8]
        ext             v3.8b,  v2.8b,  v3.8b, #1    // src[1..8]
        umull           v4.8h,  v2.8b,  v1.8b        // src[x]*(8-mx)
        umlal           v4.8h,  v3.8b,  v0.8b        //  + src[x+1]*mx
        ld1             {v6.8b,v7.8b},  [x2],  x3    // second row
        ext             v7.8b,  v6.8b,  v7.8b, #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        rshrn           v4.8b,  v4.8h,  #3           // (sum + 4) >> 3
        rshrn           v16.8b, v16.8h, #3
        st1             {v4.8b},  [x0], x1
        st1             {v16.8b}, [x0], x1
        b.gt            1b

        ret
endfunc
1645
// 8-pixel-wide vertical bilinear interpolation:
//   dst[x] = (src[x]*(8-my) + src[x+stride]*my + 4) >> 3
// Two output rows per loop iteration; the last loaded row is reused as
// the top row of the next iteration (v2).
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w6 = my.  Verify
// against the C prototype.
function ff_put_vp8_bilin8_v_neon, export=1
        mov             w7,      #8
        dup             v0.8b,   w6             // my
        sub             w6,      w7,    w6
        dup             v1.8b,   w6             // 8 - my

        ld1             {v2.8b}, [x2],  x3      // first source row
1:
        subs            w4,      w4,    #2      // two rows per iteration
        ld1             {v3.8b}, [x2],  x3      // next source row
        umull           v4.8h,   v2.8b, v1.8b   // row0*(8-my)
        umlal           v4.8h,   v3.8b, v0.8b   //  + row1*my
        ld1             {v2.8b}, [x2],  x3      // row 2; also next iteration's row 0
        umull           v6.8h,   v3.8b, v1.8b   // second output row
        umlal           v6.8h,   v2.8b, v0.8b
        rshrn           v4.8b,   v4.8h, #3      // (sum + 4) >> 3
        rshrn           v6.8b,   v6.8h, #3
        st1             {v4.8b}, [x0],  x1
        st1             {v6.8b}, [x0],  x1
        b.gt            1b

        ret
endfunc
1669
// 8-pixel-wide 2-D bilinear interpolation: horizontal filter with
// weights (8-mx, mx) followed by a vertical blend of consecutive
// filtered rows with weights (8-my, my); both stages round via
// rshrn #3.  The previous filtered row is carried across iterations
// in v22.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w5 = mx, w6 = my.
// Verify against the C prototype.
function ff_put_vp8_bilin8_hv_neon, export=1
        mov             w7,     #8
        dup             v0.8b,  w5             // mx
        sub             w5,     w7,     w5
        dup             v1.8b,  w5             // 8 - mx
        dup             v2.8b,  w6             // my
        sub             w6,     w7,     w6
        dup             v3.8b,  w6             // 8 - my

        // Prime the pipeline: horizontally filter the first row.
        ld1             {v4.8b,v5.8b},  [x2],  x3
        ext             v5.8b,  v4.8b,  v5.8b, #1    // src[1..8]
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v22.8b, v18.8h, #3           // v22 = filtered previous row
1:
        subs            w4,     w4,     #2           // two output rows per iteration
        ld1             {v6.8b,v7.8b},  [x2],  x3    // new row 1
        ext             v7.8b,  v6.8b,  v7.8b, #1
        umull           v16.8h, v6.8b,  v1.8b
        umlal           v16.8h, v7.8b,  v0.8b
        ld1             {v4.8b,v5.8b},  [x2],  x3    // new row 2
        ext             v5.8b,  v4.8b,  v5.8b, #1
        umull           v18.8h, v4.8b,  v1.8b
        umlal           v18.8h, v5.8b,  v0.8b
        rshrn           v16.8b, v16.8h, #3           // v16 = filtered row 1
        umull           v20.8h, v22.8b, v3.8b        // prev*(8-my)
        umlal           v20.8h, v16.8b, v2.8b        //  + row1*my
        rshrn           v22.8b, v18.8h, #3           // v22 = filtered row 2 (next prev)
        umull           v24.8h, v16.8b, v3.8b        // row1*(8-my)
        umlal           v24.8h, v22.8b, v2.8b        //  + row2*my
        rshrn           v20.8b, v20.8h, #3
        st1             {v20.8b}, [x0], x1
        rshrn           v23.8b, v24.8h, #3
        st1             {v23.8b}, [x0], x1
        b.gt            1b

        ret
endfunc
1708
// 4-pixel-wide horizontal bilinear interpolation.  Two rows are packed
// into one 8-byte vector (via trn1 on .2s lanes) so a single
// umull/umlal pair filters both.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w5 = mx.  Verify
// against the C prototype.
function ff_put_vp8_bilin4_h_neon, export=1
        mov             w7,      #8
        dup             v0.8b,   w5             // mx
        sub             w5,      w7,     w5
        dup             v1.8b,   w5             // 8 - mx
1:
        subs            w4,      w4,     #2     // two rows per iteration
        ld1             {v2.8b}, [x2],   x3     // row 1: src[0..7]
        // v3's previous contents leak into byte 7 of the ext result,
        // but only bytes 0..3 (src[1..4]) survive the trn1 below, so
        // the stray byte is harmless.
        ext             v3.8b,   v2.8b,  v3.8b,  #1
        ld1             {v6.8b}, [x2],   x3     // row 2
        ext             v7.8b,   v6.8b,  v7.8b,  #1
        trn1            v2.2s,   v2.2s,  v6.2s  // pack both rows' src[0..3]
        trn1            v3.2s,   v3.2s,  v7.2s  // pack both rows' src[1..4]
        umull           v4.8h,   v2.8b,  v1.8b  // src[x]*(8-mx)
        umlal           v4.8h,   v3.8b,  v0.8b  //  + src[x+1]*mx
        rshrn           v4.8b,   v4.8h,  #3     // (sum + 4) >> 3
        st1             {v4.s}[0], [x0], x1     // row 1 output
        st1             {v4.s}[1], [x0], x1     // row 2 output
        b.gt            1b

        ret
endfunc
1731
// 4-pixel-wide vertical bilinear interpolation.  Row pairs are packed
// into .2s lanes: v2 = {row N, row N+1}, v3 = {row N+1, row N+2}, so
// one umull/umlal pair produces two output rows.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w6 = my.  Verify
// against the C prototype.
function ff_put_vp8_bilin4_v_neon, export=1
        mov             w7,     #8
        dup             v0.8b,  w6              // my
        sub             w6,     w7,  w6
        dup             v1.8b,  w6              // 8 - my

        ld1r            {v2.2s},    [x2], x3    // row 0 replicated to both lanes
1:
        ld1r            {v3.2s},   [x2]         // row 1 in both lanes (no advance)
        ld1             {v2.s}[1], [x2], x3     // v2 = {row 0, row 1}
        ld1             {v3.s}[1], [x2], x3     // v3 = {row 1, row 2}
        umull           v4.8h,  v2.8b,  v1.8b   // upper rows * (8-my)
        umlal           v4.8h,  v3.8b,  v0.8b   //  + lower rows * my
        trn2            v2.2s,  v3.2s,  v2.2s   // lane 0 = row 2 for next iteration
        rshrn           v4.8b,  v4.8h,  #3      // (sum + 4) >> 3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        subs            w4,     w4,     #2      // two rows per iteration
        b.gt            1b

        ret
endfunc
1754
// 4-pixel-wide 2-D bilinear interpolation: horizontal filter with
// weights (8-mx, mx), then a vertical blend of consecutive filtered
// rows with weights (8-my, my).  Two rows are packed into .2s lanes so
// one multiply chain handles both; the previous filtered row is
// carried across iterations in lane 0 of v22.
// NOTE(review): register roles inferred from usage — x0/x1 = dst/dst
// stride, x2/x3 = src/src stride, w4 = height, w5 = mx, w6 = my.
// Verify against the C prototype.
function ff_put_vp8_bilin4_hv_neon, export=1
        mov             w7,      #8
        dup             v0.8b,   w5             // mx
        sub             w5,      w7,     w5
        dup             v1.8b,   w5             // 8 - mx
        dup             v2.8b,   w6             // my
        sub             w6,      w7,     w6
        dup             v3.8b,   w6             // 8 - my

        // Prime the pipeline: horizontally filter row 0.  The ext
        // rotates v4 so src[1..4] sit in bytes 0..3; only the low 4
        // bytes of each lane are ultimately used.
        ld1             {v4.8b}, [x2],   x3
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        umull           v18.8h,  v4.8b,  v1.8b
        umlal           v18.8h,  v5.8b,  v0.8b
        rshrn           v22.8b,  v18.8h, #3     // v22 lane 0 = filtered previous row
1:
        subs            w4,      w4,     #2     // two output rows per iteration
        ld1             {v6.8b}, [x2],   x3     // row 1
        ext             v7.8b,   v6.8b,  v6.8b,  #1
        ld1             {v4.8b}, [x2],   x3     // row 2
        ext             v5.8b,   v4.8b,  v4.8b,  #1
        trn1            v6.2s,   v6.2s,  v4.2s  // pack both rows' src[0..3]
        trn1            v7.2s,   v7.2s,  v5.2s  // pack both rows' src[1..4]
        umull           v16.8h,  v6.8b,  v1.8b  // horizontal pass for both rows
        umlal           v16.8h,  v7.8b,  v0.8b
        rshrn           v16.8b,  v16.8h, #3     // v16 = {filt row 1, filt row 2}
        umull           v20.8h,  v16.8b, v2.8b  // current rows * my
        trn1            v22.2s,  v22.2s, v16.2s // v22 = {prev, filt row 1} = rows above
        umlal           v20.8h,  v22.8b, v3.8b  //  + rows above * (8-my)
        rev64           v22.2s,  v16.2s         // lane 0 = filt row 2 for next iteration
        rshrn           v20.8b,  v20.8h, #3     // (sum + 4) >> 3
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x0], x1
        b.gt            1b

        ret
endfunc
1791