1/*
2 * VP8 ARMv6 optimisations
3 *
4 * Copyright (c) 2010 Google Inc.
5 * Copyright (c) 2010 Rob Clark <rob@ti.com>
6 * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
7 *
8 * This file is part of FFmpeg.
9 *
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
14 *
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * Lesser General Public License for more details.
19 *
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 *
24 * This code was partially ported from libvpx, which uses this license:
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions are
28 * met:
29 *
30 *   * Redistributions of source code must retain the above copyright
31 *     notice, this list of conditions and the following disclaimer.
32 *
33 *   * Redistributions in binary form must reproduce the above copyright
34 *     notice, this list of conditions and the following disclaimer in
35 *     the documentation and/or other materials provided with the
36 *     distribution.
37 *
38 *   * Neither the name of Google nor the names of its contributors may
39 *     be used to endorse or promote products derived from this software
40 *     without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 */
54
55#include "libavutil/arm/asm.S"
56
57@ idct
58
59@ void vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
60function ff_vp8_luma_dc_wht_armv6, export=1
61        push            {r4-r10, lr}
62
63        ldm             r1,  {r2-r9}
64        mov             r10, #0
65        mov             lr,  #0
66        uadd16          r12, r2,  r8            @ t0[0,1]
67        usub16          r2,  r2,  r8            @ t3[0,1]
68        stm             r1!, {r10, lr}
69        uadd16          r8,  r4,  r6            @ t1[0,1]
70        usub16          r4,  r4,  r6            @ t2[0,1]
71        stm             r1!, {r10, lr}
72        uadd16          r6,  r12, r8            @ dc0[0,1]
73        usub16          r12, r12, r8            @ dc2[0,1]
74        stm             r1!, {r10, lr}
75        uadd16          r8,  r2,  r4            @ dc1[0,1]
76        usub16          r2,  r2,  r4            @ dc3[0,1]
77        stm             r1!, {r10, lr}
78
79        uadd16          lr,  r3,  r9            @ t0[2,3]
80        usub16          r3,  r3,  r9            @ t3[2,3]
81        uadd16          r9,  r5,  r7            @ t1[2,3]
82        usub16          r5,  r5,  r7            @ t2[2,3]
83
84        uadd16          r7,  lr,  r9            @ dc0[2,3]
85        usub16          lr,  lr,  r9            @ dc2[2,3]
86        uadd16          r9,  r3,  r5            @ dc1[2,3]
87        usub16          r3,  r3,  r5            @ dc3[2,3]
88
89        mov             r1,  #3
90        orr             r1,  r1,  #0x30000      @ 3 | 3 (round)
91
92        pkhbt           r4,  r6,  r8,  lsl #16  @ dc{0,1}[0]
93        pkhtb           r6,  r8,  r6,  asr #16  @ dc{0,1}[1]
94        pkhbt           r5,  r12, r2,  lsl #16  @ dc{2,3}[0]
95        pkhtb           r12, r2,  r12, asr #16  @ dc{2,3}[1]
96        pkhbt           r8,  r7,  r9,  lsl #16  @ dc{0,1}[2]
97        uadd16          r4,  r4,  r1
98        uadd16          r5,  r5,  r1
99        pkhtb           r7,  r9,  r7,  asr #16  @ dc{0,1}[3]
100        pkhbt           r2,  lr,  r3,  lsl #16  @ dc{2,3}[2]
101        pkhtb           lr,  r3,  lr,  asr #16  @ dc{2,3}[3]
102
103        uadd16          r9,  r4,  r7            @ t0[0,1]
104        uadd16          r3,  r5,  lr            @ t0[2,3]
105        usub16          r4,  r4,  r7            @ t3[0,1]
106        usub16          r5,  r5,  lr            @ t3[2,3]
107        uadd16          r7,  r6,  r8            @ t1[0,1]
108        uadd16          lr,  r12, r2            @ t1[2,3]
109        usub16          r6,  r6,  r8            @ t2[0,1]
110        usub16          r12, r12, r2            @ t2[2,3]
111
112        uadd16          r8,  r9,  r7            @ block[0,1][0]
113        uadd16          r2,  r3,  lr            @ block[2,3][0]
114        usub16          r9,  r9,  r7            @ block[0,1][2]
115        usub16          r3,  r3,  lr            @ block[2,3][2]
116        uadd16          r7,  r4,  r6            @ block[0,1][1]
117        uadd16          lr,  r5,  r12           @ block[2,3][1]
118        usub16          r4,  r4,  r6            @ block[0,1][3]
119        usub16          r5,  r5,  r12           @ block[2,3][3]
120
121#if HAVE_ARMV6T2_EXTERNAL
122        sbfx            r6,  r8,  #3,  #13
123        sbfx            r12, r7,  #3,  #13
124        sbfx            r1,  r9,  #3,  #13
125        sbfx            r10, r4,  #3,  #13
126#else
127        sxth            r6,  r8
128        sxth            r12, r7
129        sxth            r1,  r9
130        sxth            r10, r4
131        asr             r6,  #3                 @ block[0][0]
132        asr             r12, #3                 @ block[0][1]
133        asr             r1,  #3                 @ block[0][2]
134        asr             r10, #3                 @ block[0][3]
135#endif
136
137        strh            r6,  [r0], #32
138        asr             r8,  r8,  #19           @ block[1][0]
139        strh            r12, [r0], #32
140        asr             r7,  r7,  #19           @ block[1][1]
141        strh            r1,  [r0], #32
142        asr             r9,  r9,  #19           @ block[1][2]
143        strh            r10, [r0], #32
144        asr             r4,  r4,  #19           @ block[1][3]
145        strh            r8,  [r0], #32
146        asr             r6,  r2,  #19           @ block[3][0]
147        strh            r7,  [r0], #32
148        asr             r12, lr,  #19           @ block[3][1]
149        strh            r9,  [r0], #32
150        asr             r1,  r3,  #19           @ block[3][2]
151        strh            r4,  [r0], #32
152        asr             r10, r5,  #19           @ block[3][3]
153
154#if HAVE_ARMV6T2_EXTERNAL
155        sbfx            r2,  r2,  #3,  #13
156        sbfx            lr,  lr,  #3,  #13
157        sbfx            r3,  r3,  #3,  #13
158        sbfx            r5,  r5,  #3,  #13
159#else
160        sxth            r2,  r2
161        sxth            lr,  lr
162        sxth            r3,  r3
163        sxth            r5,  r5
164        asr             r2,  #3                 @ block[2][0]
165        asr             lr,  #3                 @ block[2][1]
166        asr             r3,  #3                 @ block[2][2]
167        asr             r5,  #3                 @ block[2][3]
168#endif
169
170        strh            r2,  [r0], #32
171        strh            lr,  [r0], #32
172        strh            r3,  [r0], #32
173        strh            r5,  [r0], #32
174        strh            r6,  [r0], #32
175        strh            r12, [r0], #32
176        strh            r1,  [r0], #32
177        strh            r10, [r0], #32
178
179        pop             {r4-r10, pc}
180endfunc
181
182@ void vp8_luma_dc_wht_dc(int16_t block[4][4][16], int16_t dc[16])
183function ff_vp8_luma_dc_wht_dc_armv6, export=1
184        ldrsh           r2,  [r1]
185        mov             r3,  #0
186        add             r2,  r2,  #3
187        strh            r3,  [r1]
188        asr             r2,  r2,  #3
189    .rept 16
190        strh            r2,  [r0], #32
191    .endr
192        bx              lr
193endfunc
194
195@ void vp8_idct_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
196function ff_vp8_idct_add_armv6, export=1
197        push            {r4-r12, lr}
198        sub             sp,  sp,  #32
199
200        movw            r3,  #20091             @ cospi8sqrt2minus1
201        movw            r4,  #35468             @ sinpi8sqrt2
202        mov             r5,  sp
2031:
204        ldr             r6,  [r1, #8]       @  i5 | i4  = block1[1] | block1[0]
205        ldr             lr,  [r1, #16]      @  i9 | i8  = block2[1] | block2[0]
206        ldr             r12, [r1, #24]      @ i13 | i12 = block3[1] | block3[0]
207
208        smulwt          r9,  r3,  r6            @ ip[5] * cospi8sqrt2minus1
209        smulwb          r7,  r3,  r6            @ ip[4] * cospi8sqrt2minus1
210        smulwt          r10, r4,  r6            @ ip[5] * sinpi8sqrt2
211        smulwb          r8,  r4,  r6            @ ip[4] * sinpi8sqrt2
212        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 4c
213        smulwt          r11, r3,  r12           @ ip[13] * cospi8sqrt2minus1
214        pkhbt           r8,  r8,  r10, lsl #16  @ 5s   | 4s   = t2 first half
215        uadd16          r6,  r6,  r7            @ 5c+5 | 4c+4 = t3 first half
216        smulwb          r9,  r3,  r12           @ ip[12] * cospi8sqrt2minus1
217        smulwt          r7,  r4,  r12           @ ip[13] * sinpi8sqrt2
218        smulwb          r10, r4,  r12           @ ip[12] * sinpi8sqrt2
219
220        pkhbt           r9,  r9,  r11, lsl #16  @ 13c | 12c
221        ldr             r11, [r1]               @  i1 | i0
222        pkhbt           r10, r10,  r7, lsl #16  @ 13s | 12s    = t3 second half
223        uadd16          r7,  r12, r9            @ 13c+13  | 12c+12 = t2 2nd half
224        uadd16          r6,  r6,  r10           @ d = t3
225        uadd16          r10, r11, lr            @ a = t0
226        usub16          r7,  r8,  r7            @ c = t2
227        usub16          r8,  r11, lr            @ b = t1
228        uadd16          r9,  r10, r6            @ a+d = tmp{0,1}[0]
229        usub16          r10, r10, r6            @ a-d = tmp{0,1}[3]
230        uadd16          r6,  r8,  r7            @ b+c = tmp{0,1}[1]
231        usub16          r7,  r8,  r7            @ b-c = tmp{0,1}[2]
232        mov             r8,  #0
233        cmp             sp,  r5
234        str             r6,  [r5, #8]           @  o5 | o4
235        str             r7,  [r5, #16]          @  o9 | o8
236        str             r10, [r5, #24]          @ o13 | o12
237        str             r9,  [r5], #4           @  o1 | o0
238        str             r8,  [r1, #8]
239        str             r8,  [r1, #16]
240        str             r8,  [r1, #24]
241        str             r8,  [r1], #4
242        beq             1b
243
244        mov             r5,  #2
2452:
246        pop             {r1, r6, r12, lr}
247        smulwt          r9,  r3,  r12           @ ip[5] * cospi8sqrt2minus1
248        smulwt          r7,  r3,  r1            @ ip[1] * cospi8sqrt2minus1
249        smulwt          r10, r4,  r12           @ ip[5] * sinpi8sqrt2
250        smulwt          r8,  r4,  r1            @ ip[1] * sinpi8sqrt2
251        pkhbt           r11, r1,  r12, lsl #16  @ i4 | i0 = t0/t1 first half
252        pkhtb           r1,  r12, r1,  asr #16  @ i5 | i1
253        pkhbt           r7,  r7,  r9,  lsl #16  @ 5c | 1c
254        pkhbt           r8,  r8,  r10, lsl #16  @ 5s | 1s = t2 first half
255        pkhbt           r9,  r6,  lr,  lsl #16  @ i6 | i2 = t0/t1 second half
256        pkhtb           r12, lr,  r6,  asr #16  @ i7 | i3
257        uadd16          r1,  r7,  r1            @ 5c+5 | 1c+1 = t3 first half
258        uadd16          r10, r11, r9            @ a = t0
259        usub16          r9,  r11, r9            @ b = t1
260        smulwt          r7,  r3,  r12           @ ip[7] * cospi8sqrt2minus1
261        smulwb          lr,  r3,  r12           @ ip[3] * cospi8sqrt2minus1
262        smulwt          r11, r4,  r12           @ ip[7] * sinpi8sqrt2
263        smulwb          r6,  r4,  r12           @ ip[3] * sinpi8sqrt2
264        subs            r5,  r5,  #1
265        pkhbt           r7,  lr,  r7,  lsl #16  @ 7c | 3c
266        pkhbt           r11, r6,  r11, lsl #16  @ 7s | 3s = t3 second half
267        mov             r6,  #0x4
268        orr             r6,  r6,  #0x40000
269        uadd16          r12, r7,  r12           @ 7c+7 | 3c+3 = t2 second half
270        uadd16          r10, r10, r6            @ t0 + 4
271        uadd16          r9,  r9,  r6            @ t1 + 4
272        usub16          lr,  r8,  r12           @ c (o5 | o1) = t2
273        uadd16          r12, r11, r1            @ d (o7 | o3) = t3
274        usub16          r1,  r9,  lr            @ b-c = dst{0,1}[2]
275        uadd16          r7,  r10, r12           @ a+d = dst{0,1}[0]
276        usub16          r12, r10, r12           @ a-d = dst{0,1}[3]
277        uadd16          r10, r9,  lr            @ b+c = dst{0,1}[1]
278
279        asr             lr,  r1,  #3            @ o[1][2]
280        asr             r9,  r12, #3            @ o[1][3]
281        pkhtb           r8,  lr,  r7,  asr #19  @ o[1][0,2]
282        pkhtb           r11, r9,  r10, asr #19  @ o[1][1,3]
283        ldr             lr,  [r0]
284        sxth            r12, r12
285        ldr             r9,  [r0, r2]
286        sxth            r1,  r1
287#if HAVE_ARMV6T2_EXTERNAL
288        sbfx            r7,  r7,  #3,  #13
289        sbfx            r10, r10, #3,  #13
290#else
291        sxth            r7,  r7
292        sxth            r10, r10
293        asr             r7,  #3                 @ o[0][0]
294        asr             r10, #3                 @ o[0][1]
295#endif
296        pkhbt           r7,  r7,  r1,  lsl #13  @ o[0][0,2]
297        pkhbt           r10, r10, r12, lsl #13  @ o[0][1,3]
298
299        uxtab16         r7,  r7,  lr
300        uxtab16         r10, r10, lr,  ror #8
301        uxtab16         r8,  r8,  r9
302        uxtab16         r11, r11, r9,  ror #8
303        usat16          r7,  #8,  r7
304        usat16          r10, #8,  r10
305        usat16          r8,  #8,  r8
306        usat16          r11, #8,  r11
307        orr             r7,  r7,  r10, lsl #8
308        orr             r8,  r8,  r11, lsl #8
309        str             r8,  [r0, r2]
310        str_post        r7,  r0,  r2,  lsl #1
311
312        bne             2b
313
314        pop             {r4-r12, pc}
315endfunc
316
317@ void vp8_idct_dc_add(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
318function ff_vp8_idct_dc_add_armv6, export=1
319        push            {r4-r6, lr}
320        add             r6,  r0,  r2,  lsl #1
321        ldrsh           r3,  [r1]
322        mov             r4,  #0
323        add             r3,  r3,  #4
324        strh            r4,  [r1], #32
325        asr             r3,  #3
326        ldr             r5,  [r0]
327        ldr             r4,  [r0, r2]
328        pkhbt           r3,  r3,  r3,  lsl #16
329        uxtab16         lr,  r3,  r5            @ a1+2 | a1+0
330        uxtab16         r5,  r3,  r5,  ror #8   @ a1+3 | a1+1
331        uxtab16         r12, r3,  r4
332        uxtab16         r4,  r3,  r4,  ror #8
333        usat16          lr,  #8,  lr
334        usat16          r5,  #8,  r5
335        usat16          r12, #8,  r12
336        usat16          r4,  #8,  r4
337        orr             lr,  lr,  r5,  lsl #8
338        ldr             r5,  [r6]
339        orr             r12, r12, r4,  lsl #8
340        ldr             r4,  [r6, r2]
341        str             lr,  [r0]
342        uxtab16         lr,  r3,  r5
343        str             r12, [r0, r2]
344        uxtab16         r5,  r3,  r5,  ror #8
345        uxtab16         r12, r3,  r4
346        uxtab16         r4,  r3,  r4,  ror #8
347        usat16          lr,  #8,  lr
348        usat16          r5,  #8,  r5
349        usat16          r12, #8,  r12
350        usat16          r4,  #8,  r4
351        orr             lr,  lr,  r5,  lsl #8
352        orr             r12, r12, r4,  lsl #8
353        str             lr,  [r6]
354        str             r12, [r6, r2]
355        pop             {r4-r6, pc}
356endfunc
357
358@ void vp8_idct_dc_add4uv(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
359function ff_vp8_idct_dc_add4uv_armv6, export=1
360        push            {r4, lr}
361
362        bl              X(ff_vp8_idct_dc_add_armv6)
363        add             r0,  r0,  #4
364        bl              X(ff_vp8_idct_dc_add_armv6)
365        add             r0,  r0,  r2,  lsl #2
366        sub             r0,  r0,  #4
367        bl              X(ff_vp8_idct_dc_add_armv6)
368        add             r0,  r0,  #4
369        bl              X(ff_vp8_idct_dc_add_armv6)
370
371        pop             {r4, pc}
372endfunc
373
374@ void vp8_idct_dc_add4y(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride)
375function ff_vp8_idct_dc_add4y_armv6, export=1
376        push            {r4, lr}
377
378        bl              X(ff_vp8_idct_dc_add_armv6)
379        add             r0,  r0,  #4
380        bl              X(ff_vp8_idct_dc_add_armv6)
381        add             r0,  r0,  #4
382        bl              X(ff_vp8_idct_dc_add_armv6)
383        add             r0,  r0,  #4
384        bl              X(ff_vp8_idct_dc_add_armv6)
385
386        pop             {r4, pc}
387endfunc
388
389@ loopfilter
390
391.macro  transpose       o3,  o2,  o1,  o0,  i0,  i1,  i2,  i3
392        uxtb16          \o1, \i1                @ xx 12 xx 10
393        uxtb16          \o0, \i0                @ xx 02 xx 00
394        uxtb16          \o3, \i3                @ xx 32 xx 30
395        uxtb16          \o2, \i2                @ xx 22 xx 20
396        orr             \o1, \o0, \o1, lsl #8   @ 12 02 10 00
397        orr             \o3, \o2, \o3, lsl #8   @ 32 22 30 20
398
399        uxtb16          \i1, \i1, ror #8        @ xx 13 xx 11
400        uxtb16          \i3, \i3, ror #8        @ xx 33 xx 31
401        uxtb16          \i0, \i0, ror #8        @ xx 03 xx 01
402        uxtb16          \i2, \i2, ror #8        @ xx 23 xx 21
403        orr             \i0, \i0, \i1, lsl #8   @ 13 03 11 01
404        orr             \i2, \i2, \i3, lsl #8   @ 33 23 31 21
405
406        pkhtb           \o2, \o3, \o1, asr #16  @ 32 22 12 02
407        pkhbt           \o0, \o1, \o3, lsl #16  @ 30 20 10 00
408        pkhtb           \o3, \i2, \i0, asr #16  @ 33 23 13 03
409        pkhbt           \o1, \i0, \i2, lsl #16  @ 31 21 11 01
410.endm
411
412.macro  simple_filter
413        uqsub8          r7,  r3,  r6            @ p1 - q1
414        uqsub8          r8,  r6,  r3            @ q1 - p1
415        uqsub8          r10, r4,  r5            @ p0 - q0
416        uqsub8          r9,  r5,  r4            @ q0 - p0
417        orr             r7,  r7,  r8            @ abs(p1 - q1)
418        orr             r9,  r9,  r10           @ abs(p0 - q0)
419        uhadd8          r7,  r7,  lr            @ abs(p1 - q2) >> 1
420        uqadd8          r9,  r9,  r9            @ abs(p0 - q0) * 2
421        uqadd8          r7,  r7,  r9            @ abs(p0 - q0)*2 + abs(p1-q1)/2
422        mvn             r8,  #0
423        usub8           r10, r12, r7            @ compare to flimit
424        sel             r10, r8,  lr            @ filter mask: F or 0
425        cmp             r10, #0
426        beq             2f
427
428        eor             r3,  r3,  r2            @ ps1
429        eor             r6,  r6,  r2            @ qs1
430        eor             r4,  r4,  r2            @ ps0
431        eor             r5,  r5,  r2            @ qs0
432
433        qsub8           r3,  r3,  r6            @ vp8_filter = p1 - q1
434        qsub8           r6,  r5,  r4            @ q0 - p0
435        qadd8           r3,  r3,  r6            @ += q0 - p0
436        lsr             r7,  r2,  #5            @ 0x04040404
437        qadd8           r3,  r3,  r6            @ += q0 - p0
438        sub             r9,  r7,  r2,  lsr #7   @ 0x03030303
439        qadd8           r3,  r3,  r6            @ vp8_filter = p1-q1 + 3*(q0-p0)
440        and             r3,  r3,  r10           @ vp8_filter &= mask
441
442        qadd8           r9,  r3,  r9            @ Filter2 = vp8_filter + 3
443        qadd8           r3,  r3,  r7            @ Filter1 = vp8_filter + 4
444
445        shadd8          r9,  r9,  lr
446        shadd8          r3,  r3,  lr
447        shadd8          r9,  r9,  lr
448        shadd8          r3,  r3,  lr
449        shadd8          r9,  r9,  lr            @ Filter2 >>= 3
450        shadd8          r3,  r3,  lr            @ Filter1 >>= 3
451
452        qadd8           r4,  r4,  r9            @ u = p0 + Filter2
453        qsub8           r5,  r5,  r3            @ u = q0 - Filter1
454        eor             r4,  r4,  r2            @ *op0 = u ^ 0x80
455        eor             r5,  r5,  r2            @ *oq0 = u ^ 0x80
456.endm
457
458@ void vp8_v_loop_filter16_simple(uint8_t *dst, ptrdiff_t stride, int flim)
459function ff_vp8_v_loop_filter16_simple_armv6, export=1
460        push            {r4-r11, lr}
461
462        orr             r2,  r2,  r2,  lsl #16
463        mov             r11, #4
464        mov             lr,  #0
465        orr             r12, r2,  r2,  lsl #8
466        mov32           r2,  0x80808080
4671:
468        ldr_nreg        r3,  r0,  r1,  lsl #1   @ p1
469        ldr_nreg        r4,  r0,  r1            @ p0
470        ldr             r5,  [r0]               @ q0
471        ldr             r6,  [r0, r1]           @ q1
472        simple_filter
473T       sub             r7,  r0,  r1
474        str             r5,  [r0]               @ oq0
475A       str             r4,  [r0, -r1]          @ op0
476T       str             r4,  [r7]
4772:
478        subs            r11, r11, #1
479        add             r0,  r0,  #4
480        bne             1b
481
482        pop             {r4-r11, pc}
483endfunc
484
485.macro  filter_mask_p
486        uqsub8          r6,  r9,  r10           @ p3 - p2
487        uqsub8          r7,  r10, r9            @ p2 - p3
488        uqsub8          r8,  r10, r11           @ p2 - p1
489        uqsub8          r10, r11, r10           @ p1 - p2
490        orr             r6,  r6,  r7            @ abs(p3-p2)
491        orr             r8,  r8,  r10           @ abs(p2-p1)
492        uqsub8          lr,  r6,  r2            @ compare to limit
493        uqsub8          r8,  r8,  r2            @ compare to limit
494        uqsub8          r6,  r11, r12           @ p1 - p0
495        orr             lr,  lr,  r8
496        uqsub8          r7,  r12, r11           @ p0 - p1
497        orr             r6,  r6,  r7            @ abs(p1-p0)
498        uqsub8          r7,  r6,  r2            @ compare to limit
499        uqsub8          r8,  r6,  r3            @ compare to thresh
500        orr             lr,  lr,  r7
501.endm
502
503.macro filter_mask_pq
504        uqsub8          r6,  r11, r10           @ p1 - q1
505        uqsub8          r7,  r10, r11           @ q1 - p1
506        uqsub8          r11, r12, r9            @ p0 - q0
507        uqsub8          r12, r9,  r12           @ q0 - p0
508        orr             r6,  r6,  r7            @ abs(p1-q1)
509        orr             r12, r11, r12           @ abs(p0-q0)
510        mov32           r7,  0x7f7f7f7f
511        uqadd8          r12, r12, r12           @ abs(p0-q0) * 2
512        and             r6,  r7,  r6,  lsr #1   @ abs(p1-q1) / 2
513        uqadd8          r12, r12, r6            @ abs(p0-q0) * 2 + abs(p1-q1)/2
514.endm
515
516.macro  filter_mask_v
517        filter_mask_p
518
519        ldr             r10, [r0, r1]           @ q1
520        ldr_post        r9,  r0,  r1,  lsl #1   @ q0
521
522        filter_mask_pq
523
524        ldr             r11, [r0]               @ q2
525
526        uqsub8          r7,  r9,  r10           @ q0 - q1
527        uqsub8          r6,  r10, r9            @ q1 - q0
528        uqsub8          r12, r12, r4            @ compare to flimit
529        uqsub8          r9,  r11, r10           @ q2 - q1
530        uqsub8          r10, r10, r11           @ q1 - q2
531        orr             lr,  lr,  r12
532        ldr             r12, [r0, r1]           @ q3
533        orr             r6,  r7,  r6            @ abs(q1-q0)
534        orr             r10, r9,  r10           @ abs(q2-q1)
535        uqsub8          r9,  r12, r11           @ q3 - q2
536        uqsub8          r11, r11, r12           @ q2 - q3
537        uqsub8          r7,  r6,  r2            @ compare to limit
538        uqsub8          r10, r10, r2            @ compare to limit
539        uqsub8          r6,  r6,  r3            @ compare to thresh
540        orr             r9,  r9,  r11           @ abs(q3-q2)
541        orr             lr,  lr,  r7
542        orr             lr,  lr,  r10
543        uqsub8          r9,  r9,  r2            @ compare to limit
544        orr             lr,  lr,  r9
545
546        mov             r12, #0
547        usub8           lr,  r12, lr
548        mvn             r11, #0
549        sel             lr,  r11, r12           @ filter mask
550        sub             r0,  r0,  r1,  lsl #1
551.endm
552
553.macro  filter_mask_h
554        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr
555
556        filter_mask_p
557
558        stm             sp,  {r8, r11, r12, lr}
559        sub             r0,  r0,  r1,  lsl #2
560        add             r0,  r0,  #4
561
562        ldr             r7,  [r0, r1]
563        ldr_post        r6,  r0,  r1,  lsl #1
564        ldr             lr,  [r0, r1]
565        ldr             r8,  [r0]
566
567        transpose       r12, r11, r10, r9,  r6,  r7,  r8,  lr
568
569        uqsub8          r8,  r12, r11           @ q3 - q2
570        uqsub8          lr,  r11, r12           @ q2 - q3
571        uqsub8          r7,  r9,  r10           @ q0 - q1
572        uqsub8          r6,  r10, r9            @ q1 - q0
573        uqsub8          r12, r11, r10           @ q2 - q1
574        uqsub8          r11, r10, r11           @ q1 - q2
575        orr             r8,  r8,  lr            @ abs(q3-q2)
576        orr             r6,  r7,  r6            @ abs(q1-q0)
577        orr             r11, r12, r11           @ abs(q2-q1)
578        ldr             lr,  [sp, #12]          @ load back (f)limit accumulator
579        uqsub8          r8,  r8,  r2            @ compare to limit
580        uqsub8          r7,  r6,  r2            @ compare to limit
581        uqsub8          r11, r11, r2            @ compare to limit
582        orr             lr,  lr,  r8
583        uqsub8          r8,  r6,  r3            @ compare to thresh
584        orr             lr,  lr,  r7
585        ldr             r12, [sp, #8]           @ p1
586        orr             lr,  lr,  r11
587
588        ldr             r11, [sp, #4]           @ p0
589
590        filter_mask_pq
591
592        mov             r10, #0
593        uqsub8          r12, r12, r4            @ compare to flimit
594        mvn             r11, #0
595        orr             lr,  lr,  r12
596        usub8           lr,  r10, lr
597        sel             lr,  r11, r10           @ filter mask
598.endm
599
600.macro  filter          inner
601        mov32           r12, 0x80808080
602        eor             r11, r7,  r12           @ ps1
603        eor             r8,  r8,  r12           @ ps0
604        eor             r9,  r9,  r12           @ qs0
605        eor             r10, r10, r12           @ qs1
606
607        stm             sp,  {r8-r11}
608
609        qsub8           r7,  r11, r10           @ vp8_signed_char_clamp(ps1-qs1)
610        qsub8           r8,  r9,  r8            @ vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0))
611    .if \inner
612        and             r7,  r7,  r6            @ vp8_filter &= hev
613    .endif
614        qadd8           r7,  r7,  r8
615        lsr             r10, r12, #5            @ 0x04040404
616        qadd8           r7,  r7,  r8
617        sub             r9,  r10, r12, lsr #7   @ 0x03030303
618        qadd8           r7,  r7,  r8
619
620        and             r7,  r7,  lr            @ vp8_filter &= mask
621    .if !\inner
622        mov             r12, r7                 @ Filter2
623        and             r7,  r7,  r6            @ Filter2 &= hev
624    .endif
625        qadd8           lr,  r7,  r9            @ Filter2 = vp8_signed_char_clamp(vp8_filter+3)
626        qadd8           r7,  r7,  r10           @ Filter1 = vp8_signed_char_clamp(vp8_filter+4)
627
628        mov             r9,  #0
629        shadd8          lr,  lr,  r9            @ Filter2 >>= 3
630        shadd8          r7,  r7,  r9            @ Filter1 >>= 3
631        shadd8          lr,  lr,  r9
632        shadd8          r7,  r7,  r9
633        shadd8          lr,  lr,  r9            @ Filter2
634        shadd8          r7,  r7,  r9            @ Filter1
635.endm
636
637.macro  filter_v        inner
638        orr             r10, r6,  r8            @ calculate vp8_hevmask
639        ldr_nreg        r7,  r0,  r1,  lsl #1   @ p1
640        usub8           r10, r12, r10
641        ldr_nreg        r8,  r0,  r1            @ p0
642        sel             r6,  r12, r11           @ obtain vp8_hevmask
643        ldr             r9,  [r0]               @ q0
644        ldr             r10, [r0, r1]           @ q1
645        filter          \inner
646.endm
647
648.macro  filter_h        inner
649        orr             r9,  r6,  r8
650        usub8           r9,  r12, r9
651        sel             r6,  r12, r11           @ hev mask
652
653        stm             sp,  {r6, lr}
654
655        ldr_nreg        r12, r0,  r1,  lsl #1
656        ldr_nreg        r11, r0,  r1
657        ldr             r6,  [r0]
658        ldr             lr,  [r0, r1]
659
660        transpose       r10, r9,  r8,  r7,  r12, r11, r6,  lr
661
662        ldm             sp,  {r6, lr}
663        filter          \inner
664.endm
665
666.macro  filter_inner
667        ldm             sp,  {r8, r9}
668        lsr             r10, r10, #2            @ 0x01010101
669        qadd8           r8,  r8,  lr            @ u = vp8_signed_char_clamp(ps0 + Filter2)
670        mov             lr,  #0
671        qsub8           r9,  r9,  r7            @ u = vp8_signed_char_clamp(qs0 - Filter1)
672        sadd8           r7,  r7,  r10           @ vp8_filter += 1
673        ldr             r10, [sp, #8]           @ qs1
674        shadd8          r7,  r7,  lr            @ vp8_filter >>= 1
675        eor             r8,  r8,  r12           @ *op0 = u ^ 0x80
676        bic             r7,  r7,  r6            @ vp8_filter &= ~hev
677        qadd8           r11, r11, r7            @ u = vp8_signed_char_clamp(ps1 + vp8_filter)
678        eor             r9,  r9,  r12           @ *oq0 = u ^ 0x80
679        qsub8           r10, r10, r7            @ u = vp8_signed_char_clamp(qs1 - vp8_filter)
680        eor             r11, r11, r12           @ *op1 = u ^ 0x80
681        eor             r10, r10, r12           @ *oq1 = u ^ 0x80
682.endm
683
684.macro  filter_x        c0
685        mov             lr,  \c0
686        mov             r7,  #63
687
688        sxtb16          r6,  r12
689        sxtb16          r10, r12, ror #8
690        smlabb          r8,  r6,  lr,  r7
691        smlatb          r6,  r6,  lr,  r7
692        smlabb          r7,  r10, lr,  r7
693        smultb          r10, r10, lr
694        ssat            r8,  #8,  r8,  asr #7
695        ssat            r6,  #8,  r6,  asr #7
696        add             r10, r10, #63
697        ssat            r7,  #8,  r7,  asr #7
698        ssat            r10, #8,  r10, asr #7
699
700        pkhbt           r6,  r8,  r6,  lsl #16
701        pkhbt           r10, r7,  r10, lsl #16
702        uxtb16          r6,  r6
703        uxtb16          r10, r10
704
705        mov32           lr,  0x80808080
706
707        orr             r10, r6,  r10, lsl #8   @ u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
708        qsub8           r8,  r9,  r10           @ s = vp8_signed_char_clamp(qs0 - u)
709        qadd8           r10, r11, r10           @ s = vp8_signed_char_clamp(ps0 + u)
710        eor             r8,  r8,  lr            @ *oq0 = s ^ 0x80
711        eor             r10, r10, lr            @ *op0 = s ^ 0x80
712.endm
713
714.macro  filter_1
715        ldm             sp,  {r8, r9}
716        qadd8           r11, r8,  lr
717        qsub8           r9,  r9,  r7
718        bic             r12, r12, r6            @ vp8_filter &= ~hev
719        filter_x        #27
720.endm
721
722.macro  filter_2
723        ldr             r9,   [sp, #8]          @ qs1
724        ldr             r11,  [sp, #12]         @ ps1
725        filter_x        #18
726.endm
727
728.macro  filter_3
729        eor             r9,  r9,  lr
730        eor             r11, r11, lr
731        filter_x        #9
732.endm
733
734function vp8_v_loop_filter_inner_armv6
735        mov             r5,  #4
736        sub             sp,  sp,  #16
737
738        orr             r2,  r2,  r2,  lsl #16
739        orr             r3,  r3,  r3,  lsl #16
740        orr             r6,  r6,  r6,  lsl #16
741        orr             r4,  r2,  r2,  lsl #8   @ flimE
742        orr             r2,  r3,  r3,  lsl #8   @ flimI
743        orr             r3,  r6,  r6,  lsl #8   @ thresh
7441:
745        sub             r0,  r0,  r1,  lsl #2
746        ldr             r10, [r0, r1]           @ p2
747        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
748        ldr             r12, [r0, r1]           @ p0
749        ldr_post        r11, r0,  r1,  lsl #1   @ p1
750
751        filter_mask_v
752        cmp             lr,  #0
753        beq             2f
754        filter_v        inner=1
755        filter_inner
756
757A       str             r11, [r0, -r1, lsl #1]  @ op1
758A       str             r8,  [r0, -r1]          @ op0
759T       sub             r0,  r0,  r1,  lsl #1
760T       str             r8,  [r0, r1]
761T       str_post        r11, r0,  r1,  lsl #1
762        str             r9,  [r0]               @ oq0
763        str             r10, [r0, r1]           @ oq1
7642:
765        add             r0,  r0,  #4
766        cmp             r5,  #3
767        it              eq
768        ldreq           r0,  [sp, #16]
769        subs            r5,  r5,  #1
770        bne             1b
771
772        add             sp,  sp,  #16
773        pop             {r0, r4-r11, pc}
774endfunc
775
776function ff_vp8_v_loop_filter16_inner_armv6, export=1
777        push            {r4-r11, lr}
778        add             r12, r0,  #8
779        push            {r12}
780        ldr             r6,  [sp, #40]
781        orr             r2,  r2,  r2,  lsl #16
782        b               vp8_v_loop_filter_inner_armv6
783endfunc
784
785function ff_vp8_v_loop_filter8uv_inner_armv6, export=1
786        push            {r1, r4-r11, lr}
787        mov             r1,  r2
788        orr             r2,  r3,  r3,  lsl #16
789        ldr             r3,  [sp, #40]
790        ldr             r6,  [sp, #44]
791        b               vp8_v_loop_filter_inner_armv6
792endfunc
793
794function vp8_v_loop_filter_armv6
795        mov             r5,  #4
796        sub             sp,  sp,  #16
797
798        orr             r3,  r3,  r3,  lsl #16
799        orr             r6,  r6,  r6,  lsl #16
800        orr             r4,  r2,  r2,  lsl #8   @ flimE
801        orr             r2,  r3,  r3,  lsl #8   @ flimI
802        orr             r3,  r6,  r6,  lsl #8   @ thresh
8031:
804        sub             r0,  r0,  r1,  lsl #2
805        ldr             r10, [r0, r1]           @ p2
806        ldr_post        r9,  r0,  r1,  lsl #1   @ p3
807        ldr             r12, [r0, r1]           @ p0
808        ldr_post        r11, r0,  r1,  lsl #1   @ p1
809
810        filter_mask_v
811        cmp             lr,  #0
812        beq             2f
813
814        filter_v        inner=0
815        filter_1
816
817        str             r8,  [r0]               @ *oq0
818A       str             r10, [r0, -r1]          @ *op0
819T       sub             r0,  r0,  r1,  lsl #1
820T       str             r10, [r0, r1]
821
822        filter_2
823
824A       str             r10, [r0, -r1, lsl #1]  @ *op1
825T       str_post        r10, r0,  r1,  lsl #1
826        str             r8,  [r0, r1]           @ *oq1
827
828        ldr             r9,  [r0, r1,  lsl #1]  @ q2
829        add             r0,  r0,  r1
830A       ldr             r11, [r0, -r1, lsl #2]  @ p2
831T       ldr_dpre        r11, r0,  r1,  lsl #2
832
833        filter_3
834
835A       str             r10, [r0, -r1, lsl #2]  @ *op2
836T       str_post        r10, r0,  r1,  lsl #2
837        str             r8,  [r0, r1]           @ *oq2
838        sub             r0,  r0,  r1
8392:
840        add             r0,  r0,  #4
841        cmp             r5,  #3
842        it              eq
843        ldreq           r0,  [sp, #16]
844        subs            r5,  r5,  #1
845        bne             1b
846
847        add             sp,  sp,  #16
848        pop             {r0, r4-r11, pc}
849endfunc
850
851function ff_vp8_v_loop_filter16_armv6, export=1
852        push            {r4-r11, lr}
853        add             r12, r0,  #8
854        push            {r12}
855        ldr             r6,  [sp, #40]
856        orr             r2,  r2,  r2,  lsl #16
857        b               vp8_v_loop_filter_armv6
858endfunc
859
860function ff_vp8_v_loop_filter8uv_armv6, export=1
861        push            {r1, r4-r11, lr}
862        mov             r1,  r2
863        orr             r2,  r3,  r3,  lsl #16
864        ldr             r3,  [sp, #40]
865        ldr             r6,  [sp, #44]
866        b               vp8_v_loop_filter_armv6
867endfunc
868
869@ void vp8_h_loop_filter16_simple(uint8_t *dst, ptrdiff_t stride, int flim)
870function ff_vp8_h_loop_filter16_simple_armv6, export=1
871        push            {r4-r11, lr}
872        orr             r12, r2,  r2,  lsl #16
873        mov32           r2,  0x80808080
874        orr             r12, r12, r12, lsl #8
875
876        mov             lr,  #0
877        mov             r11, #4
8781:
879        sub             r0,  r0,  #2
880        ldr             r8,  [r0, r1]
881        ldr_post        r7,  r0,  r1,  lsl #1
882        ldr             r10, [r0, r1]
883        ldr_post        r9,  r0,  r1,  lsl #1
884        add             r0,  r0,  #2
885        transpose       r6,  r5,  r4,  r3,  r7,  r8,  r9,  r10
886        simple_filter
887        sub             r0,  r0,  r1,  lsl #2
888        sub             r0,  r0,  #1
889
890        uxtb16          r6,  r4
891        uxtb16          r8,  r5
892        uxtb16          r7,  r4,  ror #8
893        uxtb16          r9,  r5,  ror #8
894        orr             r6,  r6,  r8,  lsl #8
895        orr             r7,  r7,  r9,  lsl #8
896        lsr             r4,  r6,  #16
897        lsr             r5,  r7,  #16
898
899        strh_post       r6,  r0,  r1
900        strh_post       r7,  r0,  r1
901        strh_post       r4,  r0,  r1
902        strh_post       r5,  r0,  r1
903        add             r0,  r0,  #1
9042:
905        subs            r11, r11, #1
906        bne             1b
907
908        pop             {r4-r11, pc}
909endfunc
910
911function vp8_h_loop_filter_inner_armv6
912        mov             r5,  #4
913        sub             sp,  sp,  #16
914
915        orr             r3,  r3,  r3,  lsl #16
916        orr             r9,  r9,  r9,  lsl #16
917        orr             r4,  r2,  r2,  lsl #8   @ flimE
918        orr             r2,  r3,  r3,  lsl #8   @ flimI
919        orr             r3,  r9,  r9,  lsl #8   @ thresh
920        sub             r0,  r0,  #4
9211:
922        ldr             r7,  [r0, r1]
923        ldr_post        r6,  r0,  r1,  lsl #1
924        ldr             lr,  [r0, r1]
925        ldr_post        r8,  r0,  r1,  lsl #1
926
927        filter_mask_h
928
929        cmp             lr,  #0
930        sub             r0,  r0,  #2
931        beq             2f
932
933        ldr             r6,  [sp]
934
935        filter_h        inner=1
936        filter_inner
937
938        transpose       lr,  r12, r7,  r6,  r11, r8,  r9,  r10
939
940A       str             r6,  [r0, -r1, lsl #1]
941A       str             r7,  [r0, -r1]
942T       sub             r0,  r0,  r1,  lsl #1
943T       str             r7,  [r0, r1]
944T       str_post        r6,  r0,  r1,  lsl #1
945        str             r12, [r0]
946        str             lr,  [r0, r1]
9472:
948        sub             r0,  r0,  #2
949        add             r0,  r0,  r1,  lsl #1
950        cmp             r5,  #3
951        it              eq
952        ldreq           r0,  [sp, #16]
953        subs            r5,  r5,  #1
954        bne             1b
955
956        add             sp, sp, #16
957        pop             {r0, r4-r11, pc}
958endfunc
959
960function ff_vp8_h_loop_filter16_inner_armv6, export=1
961        push            {r4-r11, lr}
962        add             r12, r0,  r1,  lsl #3
963        sub             r12, r12, #4
964        push            {r12}
965        ldr             r9,  [sp, #40]
966        orr             r2,  r2,  r2,  lsl #16
967        b               vp8_h_loop_filter_inner_armv6
968endfunc
969
970function ff_vp8_h_loop_filter8uv_inner_armv6, export=1
971        sub             r1,  r1,  #4
972        push            {r1, r4-r11, lr}
973        mov             r1,  r2
974        orr             r2,  r3,  r3,  lsl #16
975        ldr             r3,  [sp, #40]
976        ldr             r9,  [sp, #44]
977        b               vp8_h_loop_filter_inner_armv6
978endfunc
979
980function vp8_h_loop_filter_armv6
981        mov             r5,  #4
982        sub             sp,  sp,  #16
983
984        orr             r3,  r3,  r3,  lsl #16
985        orr             r9,  r9,  r9,  lsl #16
986        orr             r4,  r2,  r2,  lsl #8   @ flimE
987        orr             r2,  r3,  r3,  lsl #8   @ flimI
988        orr             r3,  r9,  r9,  lsl #8   @ thresh
9891:
990        sub             r0,  r0,  #4
991        ldr             r7,  [r0, r1]
992        ldr_post        r6,  r0,  r1,  lsl #1
993        ldr             lr,  [r0, r1]
994        ldr_post        r8,  r0,  r1,  lsl #1
995
996        filter_mask_h
997        cmp             lr,  #0
998        it              eq
999        addeq           r0,  r0,  r1,  lsl #1
1000        beq             2f
1001
1002        ldr             r6,  [sp]
1003        sub             r0,  r0,  #2
1004
1005        filter_h        inner=0
1006        filter_1
1007
1008        sub             r0,  r0,  r1,  lsl #1
1009        uxtb16          r6,  r10
1010        uxtb16          r7,  r8
1011        uxtb16          r10, r10, ror #8
1012        uxtb16          r8,  r8,  ror #8
1013        orr             r6,  r6,  r7,  lsl #8
1014        orr             r10, r10, r8,  lsl #8
1015        lsr             r7,  r6,  #16
1016        lsr             r8,  r10, #16
1017
1018        add             r0,  r0,  #1
1019        strh_post       r6,  r0,  r1
1020        strh_post       r10, r0,  r1
1021        strh_post       r7,  r0,  r1
1022        strh_post       r8,  r0,  r1
1023
1024        filter_2
1025
1026        sub             r0,  r0,  r1,  lsl #2
1027        add             r0,  r0,  #3
1028
1029        ldrb            r11, [r0, #-5]          @ p2 for 1/7th difference
1030        strb            r10, [r0, #-4]          @ op1
1031        strb            r8,  [r0, #-1]          @ oq1
1032        ldrb_post       r9,  r0,  r1            @ q2 for 1/7th difference
1033
1034        lsr             r10, r10, #8
1035        lsr             r8,  r8,  #8
1036
1037        ldrb            r6,  [r0, #-5]
1038        strb            r10, [r0, #-4]
1039        strb            r8,  [r0, #-1]
1040        ldrb_post       r7,  r0,  r1
1041
1042        lsr             r10, r10, #8
1043        lsr             r8,  r8,  #8
1044        orr             r11, r11, r6,  lsl #8
1045        orr             r9,  r9,  r7,  lsl #8
1046
1047        ldrb            r6,  [r0, #-5]
1048        strb            r10, [r0, #-4]
1049        strb            r8,  [r0, #-1]
1050        ldrb_post       r7,  r0,  r1
1051
1052        lsr             r10, r10, #8
1053        lsr             r8,  r8,  #8
1054        orr             r11, r11, r6,  lsl #16
1055        orr             r9,  r9,  r7,  lsl #16
1056
1057        ldrb            r6,  [r0, #-5]
1058        strb            r10, [r0, #-4]
1059        strb            r8,  [r0, #-1]
1060        ldrb_post       r7,  r0,  r1
1061        orr             r11, r11, r6,  lsl #24
1062        orr             r9,  r9,  r7,  lsl #24
1063
1064        filter_3
1065
1066        sub             r0,  r0,  r1,  lsl #2
1067        strb            r10, [r0, #-5]
1068        strb_post       r8,  r0,  r1
1069        lsr             r10, r10, #8
1070        lsr             r8,  r8,  #8
1071        strb            r10, [r0, #-5]
1072        strb_post       r8,  r0,  r1
1073        lsr             r10, r10, #8
1074        lsr             r8,  r8,  #8
1075        strb            r10, [r0, #-5]
1076        strb_post       r8,  r0,  r1
1077        lsr             r10, r10, #8
1078        lsr             r8,  r8,  #8
1079        strb            r10, [r0, #-5]
1080        strb_post       r8,  r0,  r1
1081
1082        sub             r0,  r0,  #2
10832:
1084        cmp             r5,  #3
1085        it              eq
1086        ldreq           r0,  [sp, #16]
1087        subs            r5,  r5,  #1
1088        bne             1b
1089
1090        add             sp,  sp,  #16
1091        pop             {r0, r4-r11, pc}
1092endfunc
1093
1094function ff_vp8_h_loop_filter16_armv6, export=1
1095        push            {r4-r11, lr}
1096        add             r12, r0,  r1,  lsl #3
1097        push            {r12}
1098        ldr             r9,  [sp, #40]
1099        orr             r2,  r2,  r2,  lsl #16
1100        b               vp8_h_loop_filter_armv6
1101endfunc
1102
1103function ff_vp8_h_loop_filter8uv_armv6, export=1
1104        push            {r1, r4-r11, lr}
1105        mov             r1,  r2
1106        orr             r2,  r3,  r3,  lsl #16
1107        ldr             r3,  [sp, #40]
1108        ldr             r9,  [sp, #44]
1109        b               vp8_h_loop_filter_armv6
1110endfunc
1111
1112.ltorg
1113
1114@ MC
1115
1116@ void put_vp8_pixels16(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1117@                       ptrdiff_t srcstride, int h, int mx, int my)
1118function ff_put_vp8_pixels16_armv6, export=1
1119        push            {r4-r11}
1120        ldr             r12, [sp, #32]          @ h
11211:
1122        subs            r12, r12, #2
1123        ldr             r5,  [r2, #4]
1124        ldr             r6,  [r2, #8]
1125        ldr             r7,  [r2, #12]
1126        ldr_post        r4,  r2,  r3
1127        ldr             r9,  [r2, #4]
1128        ldr             r10, [r2, #8]
1129        ldr             r11, [r2, #12]
1130        ldr_post        r8,  r2,  r3
1131        strd            r6,  r7,  [r0, #8]
1132        strd_post       r4,  r5,  r0,  r1
1133        strd            r10, r11, [r0, #8]
1134        strd_post       r8,  r9,  r0,  r1
1135        bgt             1b
1136        pop             {r4-r11}
1137        bx              lr
1138endfunc
1139
1140@ void put_vp8_pixels8(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1141@                      ptrdiff_t srcstride, int h, int mx, int my)
1142function ff_put_vp8_pixels8_armv6, export=1
1143        push            {r4-r11}
1144        ldr             r12, [sp, #32]          @ h
11451:
1146        subs            r12, r12, #4
1147        ldr             r5,  [r2, #4]
1148        ldr_post        r4,  r2,  r3
1149        ldr             r7,  [r2, #4]
1150        ldr_post        r6,  r2,  r3
1151        ldr             r9,  [r2, #4]
1152        ldr_post        r8,  r2,  r3
1153        ldr             r11, [r2, #4]
1154        ldr_post        r10, r2,  r3
1155        strd_post       r4,  r5,  r0,  r1
1156        strd_post       r6,  r7,  r0,  r1
1157        strd_post       r8,  r9,  r0,  r1
1158        strd_post       r10, r11, r0,  r1
1159        bgt             1b
1160        pop             {r4-r11}
1161        bx              lr
1162endfunc
1163
1164@ void put_vp8_pixels4(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1165@                      ptrdiff_t srcstride, int h, int mx, int my)
1166function ff_put_vp8_pixels4_armv6, export=1
1167        ldr             r12, [sp, #0]           @ h
1168        push            {r4-r6,lr}
11691:
1170        subs            r12, r12, #4
1171        ldr_post        r4,  r2,  r3
1172        ldr_post        r5,  r2,  r3
1173        ldr_post        r6,  r2,  r3
1174        ldr_post        lr,  r2,  r3
1175        str_post        r4,  r0,  r1
1176        str_post        r5,  r0,  r1
1177        str_post        r6,  r0,  r1
1178        str_post        lr,  r0,  r1
1179        bgt             1b
1180        pop             {r4-r6,pc}
1181endfunc
1182
1183@ note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit
1184@ arithmetic can be used to apply filters
1185const   sixtap_filters_13245600, align=4
1186        .short     2, 108, -11,  36,  -8, 1, 0, 0
1187        .short     3,  77, -16,  77, -16, 3, 0, 0
1188        .short     1,  36,  -8, 108, -11, 2, 0, 0
1189endconst
1190
1191const   fourtap_filters_1324, align=4
1192        .short     -6,  12, 123, -1
1193        .short     -9,  50,  93, -6
1194        .short     -6,  93,  50, -9
1195        .short     -1, 123,  12, -6
1196endconst
1197
1198.macro  vp8_mc_1        name, size, hv
1199function ff_put_vp8_\name\size\()_\hv\()_armv6, export=1
1200        sub             r1,  r1,  #\size
1201        mov             r12, sp
1202        push            {r1, r4-r11, lr}
1203        ldm             r12, {r5-r7}
1204        mov             r4,  #\size
1205        stm             r12, {r4, r5}
1206        orr             r12, r6,  r7
1207        b               bl_put_\name\()_\hv\()_armv6
1208endfunc
1209.endm
1210
1211vp8_mc_1                epel,  16, h6
1212vp8_mc_1                epel,  16, v6
1213vp8_mc_1                epel,   8, h6
1214vp8_mc_1                epel,   8, v6
1215vp8_mc_1                epel,   8, h4
1216vp8_mc_1                epel,   8, v4
1217vp8_mc_1                epel,   4, h6
1218vp8_mc_1                epel,   4, v6
1219vp8_mc_1                epel,   4, h4
1220vp8_mc_1                epel,   4, v4
1221
1222vp8_mc_1                bilin, 16, h
1223vp8_mc_1                bilin, 16, v
1224vp8_mc_1                bilin,  8, h
1225vp8_mc_1                bilin,  8, v
1226vp8_mc_1                bilin,  4, h
1227vp8_mc_1                bilin,  4, v
1228
1229@ 4 and 8 pixel wide mc blocks might have height of 8 or 16 lines
1230#define TMPSIZE \size * (16 / ((16 / \size + 1) / 2) + \ytaps - 1)
1231
1232.macro  vp8_mc_hv       name, size, h, v, ytaps
1233function ff_put_vp8_\name\size\()_\h\v\()_armv6, export=1
1234        push            {r0, r1, r4, lr}
1235        add             r0,  sp,  #16
1236        sub             sp,  sp,  #TMPSIZE+16
1237        ldm             r0,  {r0, r12}
1238        mov             r4,  #\size
1239        add             lr,  r0,  #\ytaps-1
1240    .if \ytaps > 2
1241        sub             r2,  r2,  r3,  lsl #\ytaps >> 1 & 1
1242    .endif
1243        stm             sp,  {r4, lr}
1244        add             r0,  sp,  #16
1245        mov             r1,  #0
1246        bl              vp8_put_\name\()_\h\()_armv6
1247        add             r0,  sp,  #TMPSIZE+16
1248        ldr             lr,  [sp, #TMPSIZE+16+16]
1249        ldm             r0,  {r0, r1}
1250        mov             r3,  #\size
1251        ldr             r12, [sp, #TMPSIZE+16+16+8]
1252        str             lr,  [sp, #4]
1253        add             r2,  sp,  #16 + \size * (\ytaps / 2 - 1)
1254        sub             r1,  r1,  #\size
1255        bl              vp8_put_\name\()_\v\()_armv6
1256        add             sp,  sp,  #TMPSIZE+16+8
1257        pop             {r4, pc}
1258endfunc
1259.endm
1260
1261vp8_mc_hv               epel,  16, h6, v6, 6
1262vp8_mc_hv               epel,   8, h6, v6, 6
1263vp8_mc_hv               epel,   8, h4, v6, 6
1264vp8_mc_hv               epel,   8, h6, v4, 4
1265vp8_mc_hv               epel,   8, h4, v4, 4
1266vp8_mc_hv               epel,   4, h6, v6, 6
1267vp8_mc_hv               epel,   4, h4, v6, 6
1268vp8_mc_hv               epel,   4, h6, v4, 4
1269vp8_mc_hv               epel,   4, h4, v4, 4
1270
1271vp8_mc_hv               bilin, 16, h,  v,  2
1272vp8_mc_hv               bilin,  8, h,  v,  2
1273vp8_mc_hv               bilin,  4, h,  v,  2
1274
1275.macro  sat4            r0,  r1,  r2,  r3
1276        asr             \r0, \r0, #7
1277        asr             \r1, \r1, #7
1278        pkhbt           \r0, \r0, \r2, lsl #9
1279        pkhbt           \r1, \r1, \r3, lsl #9
1280        usat16          \r0, #8,  \r0
1281        usat16          \r1, #8,  \r1
1282        orr             \r0, \r0, \r1, lsl #8
1283.endm
1284
1285@ Calling convention for the inner MC functions:
1286@       r0      dst
1287@       r1      dst_stride - block_width
1288@       r2      src
1289@       r3      src_stride
1290@       r4      block_width
1291@       r12     filter_index
1292@       [sp]    block_width
1293@       [sp+4]  height
1294@       [sp+8]  scratch
1295
function vp8_put_epel_h6_armv6
        push            {r1, r4-r11, lr}
bl_put_epel_h6_armv6:
        sub             r2,  r2,  #2
        movrel          lr,  sixtap_filters_13245600 - 16
        add             lr,  lr,  r12, lsl #3
        sub             r3,  r3,  r4
        str             r3,  [sp, #48]
        ldm             lr,  {r1, r3, lr}
1:
        ldr             r7,  [r2, #5]           @ src[5-8]
        ldr             r6,  [r2, #2]           @ src[2-5]
        ldr             r5,  [r2], #4           @ src[0-3]

        pkhtb           r7,  r7,  r7,  asr #8   @ src[8,7,7,6]
        uxtb16          r9,  r6,  ror #8        @ src[5] | src[3]
        uxtb16          r6,  r6                 @ src[4] | src[2]
        uxtb16          r8,  r5,  ror #8        @ src[3] | src[1]
        uxtb16          r11, r7,  ror #8        @ src[8] | src[7]
        uxtb16          r7,  r7                 @ src[7] | src[6]
        uxtb16          r5,  r5                 @ src[2] | src[0]

        mov             r10, #0x40
        smlad           r5,  r5,  r1,  r10      @ filter[0][0]
        smlad           r11, r11, lr,  r10      @ filter[3][2]
        smlad           r12, r7,  lr,  r10      @ filter[2][2]
        smlad           r10, r8,  r1,  r10      @ filter[1][0]
        smlad           r5,  r8,  r3,  r5       @ filter[0][1]
        smlad           r11, r9,  r1,  r11      @ filter[3][0]
        smlad           r12, r9,  r3,  r12      @ filter[2][1]
        pkhtb           r9,  r9,  r6,  asr #16  @ src[5] | src[4]
        smlad           r10, r6,  r3,  r10      @ filter[1][1]
        pkhbt           r7,  r9,  r7,  lsl #16  @ src[6] | src[4]
        smlad           r5,  r9,  lr,  r5       @ filter[0][2]
        pkhtb           r8,  r7,  r9,  asr #16  @ src[6] | src[5]
        smlad           r11, r7,  r3,  r11      @ filter[3][1]
        smlad           r9,  r8,  lr,  r10      @ filter[1][2]
        smlad           r7,  r6,  r1,  r12      @ filter[2][0]

        subs            r4,  r4,  #4

        sat4            r5,  r9,  r7,  r11
        str             r5,  [r0], #4

        bne             1b

        add             r4,  sp,  #40
        ldm             r4,  {r4, r5, r12}
        ldr             r6,  [sp]
        subs            r5,  r5,  #1
        add             r2,  r2,  r12
        str             r5,  [sp, #44]
        add             r0,  r0,  r6

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

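@ Six-tap vertical filter: same calling convention as above, filtering
@ the six source rows from src - 2 * stride to src + 3 * stride.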
function vp8_put_epel_v6_armv6
        push            {r1, r4-r11, lr}
bl_put_epel_v6_armv6:
        movrel          lr,  sixtap_filters_13245600 - 16
        add             lr,  lr,  r12, lsl #3
        str             r3,  [sp, #48]
1:
        add             r1,  r3,  r3,  lsl #1   @ stride * 3
        ldr_nreg        r5,  r2,  r3            @ src[0,1,2,3 + stride * 1]
        ldr             r6,  [r2, r3]           @ src[0,1,2,3 + stride * 3]
        ldr             r7,  [r2, r3,  lsl #1]  @ src[0,1,2,3 + stride * 4]
        ldr             r8,  [r2, r1]           @ src[0,1,2,3 + stride * 5]

        uxtb16          r9,  r5,  ror #8        @ src[3 + s*1] | src[1 + s*1]
        uxtb16          r10, r6,  ror #8        @ src[3 + s*3] | src[1 + s*3]
        uxtb16          r11, r7,  ror #8        @ src[3 + s*4] | src[1 + s*4]
        uxtb16          r12, r8,  ror #8        @ src[3 + s*5] | src[1 + s*5]
        uxtb16          r5,  r5                 @ src[2 + s*1] | src[0 + s*1]
        uxtb16          r6,  r6                 @ src[2 + s*3] | src[0 + s*3]
        uxtb16          r7,  r7                 @ src[2 + s*4] | src[0 + s*4]
        uxtb16          r8,  r8                 @ src[2 + s*5] | src[0 + s*5]
        pkhbt           r1,  r9,  r10, lsl #16  @ src[1 + s*3] | src[1 + s*1]
        pkhtb           r9,  r10, r9,  asr #16  @ src[3 + s*3] | src[3 + s*1]
        pkhbt           r10, r11, r12, lsl #16  @ src[1 + s*5] | src[1 + s*4]
        pkhtb           r11, r12, r11, asr #16  @ src[3 + s*5] | src[3 + s*4]
        pkhbt           r12, r5,  r6,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
        pkhtb           r5,  r6,  r5,  asr #16  @ src[2 + s*3] | src[2 + s*1]
        pkhbt           r6,  r7,  r8,  lsl #16  @ src[0 + s*5] | src[0 + s*4]
        pkhtb           r7,  r8,  r7,  asr #16  @ src[2 + s*5] | src[2 + s*4]

        ldr             r8,  [lr, #4]
        mov             r3,  #0x40
        smlad           r12, r12, r8,  r3       @ filter[0][1]
        smlad           r1,  r1,  r8,  r3       @ filter[1][1]
        smlad           r5,  r5,  r8,  r3       @ filter[2][1]
        smlad           r9,  r9,  r8,  r3       @ filter[3][1]
        ldr             r8,  [lr, #8]
        ldr             r3,  [sp, #48]
        smlad           r12, r6,  r8,  r12      @ filter[0][2]
        smlad           r1,  r10, r8,  r1       @ filter[1][2]
        ldr_nreg        r6,  r2,  r3,  lsl #1   @ src[0,1,2,3 + stride * 0]
        ldr             r10, [r2], #4           @ src[0,1,2,3 + stride * 2]
        smlad           r5,  r7,  r8,  r5       @ filter[2][2]
        smlad           r9,  r11, r8,  r9       @ filter[3][2]

        uxtb16          r7,  r6,  ror #8        @ src[3 + s*0] | src[1 + s*0]
        uxtb16          r11, r10, ror #8        @ src[3 + s*2] | src[1 + s*2]
        uxtb16          r6,  r6                 @ src[2 + s*0] | src[0 + s*0]
        uxtb16          r10, r10                @ src[2 + s*2] | src[0 + s*2]

        pkhbt           r8,  r7,  r11, lsl #16  @ src[1 + s*2] | src[1 + s*0]
        pkhtb           r7,  r11, r7,  asr #16  @ src[3 + s*2] | src[3 + s*0]
        pkhbt           r11, r6,  r10, lsl #16  @ src[0 + s*2] | src[0 + s*0]
        pkhtb           r6,  r10, r6,  asr #16  @ src[2 + s*2] | src[2 + s*0]

        ldr             r10, [lr]
        subs            r4,  r4,  #4
        smlad           r12, r11, r10, r12      @ filter[0][0]
        smlad           r1,  r8,  r10, r1       @ filter[1][0]
        smlad           r5,  r6,  r10, r5       @ filter[2][0]
        smlad           r9,  r7,  r10, r9       @ filter[3][0]

        sat4            r12, r1,  r5,  r9
        str             r12, [r0], #4

        bne             1b

        ldrd            r4,  r5,  [sp, #40]
        ldr             r6,  [sp]
        subs            r5,  r5,  #1
        sub             r2,  r2,  r4
        str             r5,  [sp, #44]
        add             r0,  r0,  r6
        add             r2,  r2,  r3

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

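@ Four-tap horizontal filter: as the 6-tap version above, but using the
@ fourtap_filters_1324 table and src[-1..2] around each output pixel.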
function vp8_put_epel_h4_armv6
        push            {r1, r4-r11, lr}
bl_put_epel_h4_armv6:
        subs            r2,  r2,  #1
        movrel          lr,  fourtap_filters_1324 - 4
        add             lr,  lr,  r12, lsl #2
        sub             r3,  r3,  r4
        ldm             lr,  {r5, r6}
        ldr             lr,  [sp, #44]
1:
        ldr             r9,  [r2, #3]
        ldr             r8,  [r2, #2]
        ldr             r7,  [r2], #4

        uxtb16          r9,  r9,  ror #8        @ src[6] | src[4]
        uxtb16          r10, r8,  ror #8        @ src[5] | src[3]
        uxtb16          r8,  r8                 @ src[4] | src[2]
        uxtb16          r11, r7,  ror #8        @ src[3] | src[1]
        uxtb16          r7,  r7                 @ src[2] | src[0]

        mov             r12, #0x40
        smlad           r9,  r9,  r6,  r12      @ filter[3][1]
        smlad           r7,  r7,  r5,  r12      @ filter[0][0]
        smlad           r9,  r10, r5,  r9       @ filter[3][0]
        smlad           r10, r10, r6,  r12      @ filter[2][1]
        smlad           r12, r11, r5,  r12      @ filter[1][0]
        smlad           r7,  r11, r6,  r7       @ filter[0][1]
        smlad           r10, r8,  r5,  r10      @ filter[2][0]
        smlad           r12, r8,  r6,  r12      @ filter[1][1]

        subs            r4,  r4,  #4

        sat4            r7,  r12, r10, r9
        str             r7,  [r0], #4

        bne             1b

        subs            lr,  lr,  #1
        ldr             r4,  [sp, #40]
        add             r2,  r2,  r3
        add             r0,  r0,  r1

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

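@ Four-tap vertical filter over the rows src - stride to src + 2 * stride.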
function vp8_put_epel_v4_armv6
        push            {r1, r4-r11, lr}
bl_put_epel_v4_armv6:
        movrel          lr,  fourtap_filters_1324 - 4
        add             lr,  lr,  r12, lsl #2
        ldm             lr,  {r5, r6}
        str             r3,  [sp, #48]
1:
        ldr             lr,  [r2, r3, lsl #1]
        ldr             r12, [r2, r3]
        ldr_nreg        r7,  r2,  r3
        ldr             r11, [r2], #4

        uxtb16          r8,  lr,  ror #8        @ src[3 + s*3] | src[1 + s*3]
        uxtb16          r9,  r12, ror #8        @ src[3 + s*2] | src[1 + s*2]
        uxtb16          r3,  r7,  ror #8        @ src[3 + s*0] | src[1 + s*0]
        uxtb16          r1,  r11, ror #8        @ src[3 + s*1] | src[1 + s*1]
        uxtb16          lr,  lr                 @ src[2 + s*3] | src[0 + s*3]
        uxtb16          r12, r12                @ src[2 + s*2] | src[0 + s*2]
        uxtb16          r7,  r7                 @ src[2 + s*0] | src[0 + s*0]
        uxtb16          r11, r11                @ src[2 + s*1] | src[0 + s*1]
        pkhbt           r10, r1,  r8,  lsl #16  @ src[1 + s*3] | src[1 + s*1]
        pkhtb           r1,  r8,  r1,  asr #16  @ src[3 + s*3] | src[3 + s*1]
        pkhbt           r8,  r3,  r9,  lsl #16  @ src[1 + s*2] | src[1 + s*0]
        pkhtb           r3,  r9,  r3,  asr #16  @ src[3 + s*2] | src[3 + s*0]
        pkhbt           r9,  r11, lr,  lsl #16  @ src[0 + s*3] | src[0 + s*1]
        pkhtb           r11, lr,  r11, asr #16  @ src[2 + s*3] | src[2 + s*1]
        pkhbt           lr,  r7,  r12, lsl #16  @ src[0 + s*2] | src[0 + s*0]
        pkhtb           r7,  r12, r7,  asr #16  @ src[2 + s*2] | src[2 + s*0]

        mov             r12, #0x40
        smlad           r9,  r9,  r6,  r12      @ filter[0][1]
        smlad           r10, r10, r6,  r12      @ filter[1][1]
        smlad           r11, r11, r6,  r12      @ filter[2][1]
        smlad           r1,  r1,  r6,  r12      @ filter[3][1]
        smlad           r9,  lr,  r5,  r9       @ filter[0][0]
        smlad           r10, r8,  r5,  r10      @ filter[1][0]
        smlad           r11, r7,  r5,  r11      @ filter[2][0]
        smlad           r1,  r3,  r5,  r1       @ filter[3][0]

        subs            r4,  r4,  #4
        ldr             r3,  [sp, #48]

        sat4            r9,  r10, r11, r1
        str             r9,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        ldr             r12, [sp, #44]
        add             r2,  r2,  r3
        ldr             r9,  [sp, #0]
        subs            r12, r12, #1
        sub             r2,  r2,  r4
        str             r12, [sp, #44]
        add             r0,  r0,  r9

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

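@ Bilinear horizontal filter: each output pixel is
@ (src[x] * (8 - frac) + src[x + 1] * frac + 4) >> 3, where frac is the
@ filter index passed in r12.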
function vp8_put_bilin_h_armv6
        push            {r1, r4-r11, lr}
bl_put_bilin_h_armv6:
        rsb             r5,  r12, r12, lsl #16
        ldr             r12, [sp, #44]
        sub             r3,  r3,  r4
        add             r5,  r5,  #8
1:
        ldrb            r6,  [r2], #1
        ldrb            r7,  [r2], #1
        ldrb            r8,  [r2], #1
        ldrb            r9,  [r2], #1
        ldrb            lr,  [r2]

        pkhbt           r6,  r6,  r7,  lsl #16  @ src[1] | src[0]
        pkhbt           r7,  r7,  r8,  lsl #16  @ src[2] | src[1]
        pkhbt           r8,  r8,  r9,  lsl #16  @ src[3] | src[2]
        pkhbt           r9,  r9,  lr,  lsl #16  @ src[4] | src[3]

        mov             r10, #4
        smlad           r6,  r6,  r5,  r10
        smlad           r7,  r7,  r5,  r10
        smlad           r8,  r8,  r5,  r10
        smlad           r9,  r9,  r5,  r10

        subs            r4,  r4,  #4

        asr             r6,  #3
        asr             r7,  #3
        pkhbt           r6,  r6,  r8,  lsl #13
        pkhbt           r7,  r7,  r9,  lsl #13
        orr             r6,  r6,  r7,  lsl #8
        str             r6,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        subs            r12, r12, #1
        add             r2,  r2,  r3
        add             r0,  r0,  r1

        bne             1b

        pop             {r1, r4-r11, pc}
endfunc

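@ Bilinear vertical filter:
@ (src[x] * (8 - frac) + src[x + stride] * frac + 4) >> 3.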
function vp8_put_bilin_v_armv6
        push            {r1, r4-r11, lr}
bl_put_bilin_v_armv6:
        rsb             r5,  r12, r12, lsl #16
        ldr             r12, [sp, #44]
        add             r5,  r5,  #8
1:
        ldrb            r10, [r2, r3]
        ldrb            r6,  [r2], #1
        ldrb            r11, [r2, r3]
        ldrb            r7,  [r2], #1
        ldrb            lr,  [r2, r3]
        ldrb            r8,  [r2], #1
        ldrb            r9,  [r2, r3]
        pkhbt           r6,  r6,  r10, lsl #16
        ldrb            r10, [r2], #1
        pkhbt           r7,  r7,  r11, lsl #16
        pkhbt           r8,  r8,  lr,  lsl #16
        pkhbt           r9,  r10, r9,  lsl #16

        mov             r10, #4
        smlad           r6,  r6,  r5,  r10
        smlad           r7,  r7,  r5,  r10
        smlad           r8,  r8,  r5,  r10
        smlad           r9,  r9,  r5,  r10

        subs            r4,  r4,  #4

        asr             r6,  #3
        asr             r7,  #3
        pkhbt           r6,  r6,  r8,  lsl #13
        pkhbt           r7,  r7,  r9,  lsl #13
        orr             r6,  r6,  r7,  lsl #8
        str             r6,  [r0], #4

        bne             1b

        ldr             r4,  [sp, #40]
        subs            r12, r12, #1
        add             r2,  r2,  r3
        add             r0,  r0,  r1
        sub             r2,  r2,  r4

        bne             1b
        pop             {r1, r4-r11, pc}
endfunc
