/*
 * Loongson SIMD optimized vp8dsp
 *
 * Copyright (c) 2016 Loongson Technology Corporation Limited
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vp8dsp_mips.h"
#include "constants.h"
#include "libavutil/mips/mmiutils.h"
#include "libavutil/mem_internal.h"

#define DECLARE_DOUBLE_1            double db_1
#define DECLARE_DOUBLE_2            double db_2
#define DECLARE_UINT32_T            uint32_t  it_1
#define RESTRICT_ASM_DOUBLE_1       [db_1]"=&f"(db_1)
#define RESTRICT_ASM_DOUBLE_2       [db_2]"=&f"(db_2)
#define RESTRICT_ASM_UINT32_T       [it_1]"=&r"(it_1)

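/* Emulate an unsigned per-byte "greater than" compare, which MMI lacks:
 * dst bytes are set to 0xff where src1 > src2, built as
 * (max(src1, src2) == src1) XOR (src1 == src2). Clobbers db_1/db_2. */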
#define MMI_PCMPGTUB(dst, src1, src2)                                       \
        "pcmpeqb    %[db_1],    "#src1",        "#src2"             \n\t"   \
        "pmaxub     %[db_2],    "#src1",        "#src2"             \n\t"   \
        "pcmpeqb    %[db_2],    %[db_2],        "#src1"             \n\t"   \
        "xor        "#dst",     %[db_2],        %[db_1]             \n\t"

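/* Sign-extend the eight bytes in src to halfwords: dst_r receives the low
 * four, dst_l the high four. pcmpgtb against zero yields the sign mask. */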
#define MMI_BTOH(dst_l, dst_r, src)                                         \
        "xor        %[db_1],    %[db_1],        %[db_1]             \n\t"   \
        "pcmpgtb    %[db_2],    %[db_1],        "#src"              \n\t"   \
        "punpcklbh  "#dst_r",   "#src",         %[db_2]             \n\t"   \
        "punpckhbh  "#dst_l",   "#src",         %[db_2]             \n\t"

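/* The full VP8 normal (macroblock-edge) loop filter on eight pixels across
 * one edge, held in registers p3..q3. The thresh/e/i operands are splatted
 * to all byte lanes to derive the high-edge-variance flag and the filter
 * mask; pixels are then biased by 0x80 so signed saturating arithmetic can
 * be used, adjusted by the three weighted passes (27, 18, 9 - compare
 * filter_mbedge below), and un-biased on the way out. */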
#define MMI_VP8_LOOP_FILTER                                                 \
        /* Calculation of hev */                                            \
        "dmtc1      %[thresh],  %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "pasubub    %[ftmp0],   %[p1],          %[p0]               \n\t"   \
        "pasubub    %[ftmp1],   %[q1],          %[q0]               \n\t"   \
        "pmaxub     %[ftmp0],   %[ftmp0],       %[ftmp1]            \n\t"   \
        MMI_PCMPGTUB(%[hev], %[ftmp0], %[ftmp3])                            \
        /* Calculation of mask */                                           \
        "pasubub    %[ftmp1],   %[p0],          %[q0]               \n\t"   \
        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "pasubub    %[ftmp2],   %[p1],          %[q1]               \n\t"   \
        "li         %[tmp0],    0x09                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp3]                            \n\t"   \
        PSRLB_MMI(%[ftmp2],  %[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp2])     \
        "paddusb    %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "dmtc1      %[e],       %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        MMI_PCMPGTUB(%[mask], %[ftmp1], %[ftmp3])                           \
        "pmaxub     %[mask],    %[mask],        %[ftmp0]            \n\t"   \
        "pasubub    %[ftmp1],   %[p3],          %[p2]               \n\t"   \
        "pasubub    %[ftmp2],   %[p2],          %[p1]               \n\t"   \
        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
        "pasubub    %[ftmp1],   %[q3],          %[q2]               \n\t"   \
        "pasubub    %[ftmp2],   %[q2],          %[q1]               \n\t"   \
        "pmaxub     %[ftmp1],   %[ftmp1],       %[ftmp2]            \n\t"   \
        "pmaxub     %[mask],    %[mask],        %[ftmp1]            \n\t"   \
        "dmtc1      %[i],       %[ftmp3]                            \n\t"   \
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklhw  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "punpcklwd  %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        MMI_PCMPGTUB(%[mask], %[mask], %[ftmp3])                            \
        "pcmpeqw    %[ftmp3],   %[ftmp3],       %[ftmp3]            \n\t"   \
        "xor        %[mask],    %[mask],        %[ftmp3]            \n\t"   \
        /* VP8_MBFILTER */                                                  \
        "li         %[tmp0],    0x80808080                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp7]                            \n\t"   \
        "punpcklwd  %[ftmp7],   %[ftmp7],       %[ftmp7]            \n\t"   \
        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"   \
        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
        "psubsb     %[ftmp4],   %[p1],          %[q1]               \n\t"   \
        "psubb      %[ftmp5],   %[q0],          %[p0]               \n\t"   \
        MMI_BTOH(%[ftmp1],  %[ftmp0],  %[ftmp5])                            \
        MMI_BTOH(%[ftmp3],  %[ftmp2],  %[ftmp4])                            \
        /* Right part */                                                    \
        "paddh      %[ftmp5],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddh      %[ftmp0],   %[ftmp0],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp0],   %[ftmp2],       %[ftmp0]            \n\t"   \
        /* Left part */                                                     \
        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp1],   %[ftmp3],       %[ftmp1]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp1],   %[ftmp0],       %[ftmp1]            \n\t"   \
        "and        %[ftmp1],   %[ftmp1],       %[mask]             \n\t"   \
        "and        %[ftmp2],   %[ftmp1],       %[hev]              \n\t"   \
        "li         %[tmp0],    0x04040404                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddsb     %[ftmp3],   %[ftmp2],       %[ftmp0]            \n\t"   \
        "li         %[tmp0],    0x0B                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp4]                            \n\t"   \
        PSRAB_MMI(%[ftmp3],  %[ftmp4],  %[ftmp5],  %[ftmp6],  %[ftmp3])     \
        "li         %[tmp0],    0x03030303                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "paddsb     %[ftmp4],   %[ftmp2],       %[ftmp0]            \n\t"   \
        "li         %[tmp0],    0x0B                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
        PSRAB_MMI(%[ftmp4],  %[ftmp2],  %[ftmp5],  %[ftmp6],  %[ftmp4])     \
        "psubsb     %[q0],      %[q0],          %[ftmp3]            \n\t"   \
        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
        /* filt_val &= ~hev */                                              \
        "pcmpeqw    %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        "xor        %[hev],     %[hev],         %[ftmp0]            \n\t"   \
        "and        %[ftmp1],   %[ftmp1],       %[hev]              \n\t"   \
        MMI_BTOH(%[ftmp5],  %[ftmp6],  %[ftmp1])                            \
        "li         %[tmp0],    0x07                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp2]                            \n\t"   \
        "li         %[tmp0],    0x001b001b                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        "li         %[tmp0],    0x003f003f                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp0]                            \n\t"   \
        "punpcklwd  %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"   \
        /* Right part */                                                    \
        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q0],      %[q0],          %[ftmp4]            \n\t"   \
        "xor        %[q0],      %[q0],          %[ftmp7]            \n\t"   \
        "paddsb     %[p0],      %[p0],          %[ftmp4]            \n\t"   \
        "xor        %[p0],      %[p0],          %[ftmp7]            \n\t"   \
        "li         %[tmp0],    0x00120012                          \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        "punpcklwd  %[ftmp1],   %[ftmp1],       %[ftmp1]            \n\t"   \
        /* Right part */                                                    \
        "pmullh     %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "pmullh     %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q1],      %[q1],          %[ftmp4]            \n\t"   \
        "xor        %[q1],      %[q1],          %[ftmp7]            \n\t"   \
        "paddsb     %[p1],      %[p1],          %[ftmp4]            \n\t"   \
        "xor        %[p1],      %[p1],          %[ftmp7]            \n\t"   \
        "li         %[tmp0],    0x03                                \n\t"   \
        "dmtc1      %[tmp0],    %[ftmp1]                            \n\t"   \
        /* Right part */                                                    \
        "psllh      %[ftmp3],   %[ftmp6],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp6]            \n\t"   \
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
        /* Left part */                                                     \
        "psllh      %[ftmp4],   %[ftmp5],       %[ftmp1]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"   \
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"   \
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp2]            \n\t"   \
        /* Combine left and right part */                                   \
        "packsshb   %[ftmp4],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "psubsb     %[q2],      %[q2],          %[ftmp4]            \n\t"   \
        "xor        %[q2],      %[q2],          %[ftmp7]            \n\t"   \
        "paddsb     %[p2],      %[p2],          %[ftmp4]            \n\t"   \
        "xor        %[p2],      %[p2],          %[ftmp7]            \n\t"

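/* One row of the 6-tap horizontal subpel filter on a 4-pixel block, as in
 * the FILTER_6TAP reference further down: the negative taps (filter1,
 * filter4) are applied with saturating subtracts, the sum is rounded with
 * ff_pw_64 and arithmetically shifted right by ftmp4, which the callers
 * (outside this excerpt) are expected to load with 7. */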
#define PUT_VP8_EPEL4_H6_MMI(src, dst)                                      \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x02)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x03)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


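/* The 4-tap variant of the above: filter0 and filter5 are zero for these
 * filters and the corresponding loads are skipped, matching the
 * FILTER_4TAP reference further down. */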
#define PUT_VP8_EPEL4_H4_MMI(src, dst)                                      \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubh      %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


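/* Vertical 6-tap counterpart for a 4-pixel block: src1 is a scratch
 * pointer register, stepped by srcstride to reach the rows at -2..+3. */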
#define PUT_VP8_EPEL4_V6_MMI(src, src1, dst, srcstride)                     \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


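/* Vertical 4-tap counterpart, reading only the rows at -1..+2. */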
#define PUT_VP8_EPEL4_V4_MMI(src, src1, dst, srcstride)                     \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
                                                                            \
        MMI_SWC1(%[ftmp1], dst, 0x00)


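/* 8-pixel-wide 6-tap horizontal filter: same maths as the 4-pixel version,
 * but each 8-byte load is unpacked into low and high halfword halves
 * (ftmp2/ftmp3) that are filtered in parallel and re-packed for one
 * 8-byte store. */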
#define PUT_VP8_EPEL8_H6_MMI(src, dst)                                      \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x02)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x03)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


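/* 8-pixel-wide 4-tap horizontal filter. */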
#define PUT_VP8_EPEL8_H4_MMI(src, dst)                                      \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, -0x01)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x02)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


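/* 8-pixel-wide 6-tap vertical filter. */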
#define PUT_VP8_EPEL8_V6_MMI(src, src1, dst, srcstride)                     \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter0]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter0]          \n\t"   \
        "paddsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter5]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter5]          \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


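/* 8-pixel-wide 4-tap vertical filter. */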
#define PUT_VP8_EPEL8_V4_MMI(src, src1, dst, srcstride)                     \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter2]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter2]          \n\t"   \
                                                                            \
        PTR_SUBU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter1]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter1]          \n\t"   \
        "psubsh     %[ftmp7],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp8],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[filter3]          \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[filter3]          \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src1",        "#srcstride"        \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[filter4]          \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[filter4]          \n\t"   \
        "psubsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "psubsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp7]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp8]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_64]         \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_64]         \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
                                                                            \
        MMI_SDC1(%[ftmp1], dst, 0x00)


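/* One row of the horizontal bilinear filter on 8 pixels:
 * dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3, with ff_pw_4 providing
 * the rounding and ftmp4 the shift amount, which the callers (outside
 * this excerpt) are expected to load with 3. */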
#define PUT_VP8_BILINEAR8_H_MMI(src, dst)                                   \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[a]                \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[a]                \n\t"   \
                                                                            \
        MMI_ULDC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[b]                \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


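/* 4-pixel horizontal bilinear row. */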
#define PUT_VP8_BILINEAR4_H_MMI(src, dst)                                   \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[a]                \n\t"   \
                                                                            \
        MMI_ULWC1(%[ftmp1], src, 0x01)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[b]                \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


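/* 8-pixel vertical bilinear row, blending the current row with the one
 * sstride bytes below it using the c/d weights. */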
#define PUT_VP8_BILINEAR8_V_MMI(src, src1, dst, sstride)                    \
        MMI_ULDC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp5],   %[ftmp2],       %[c]                \n\t"   \
        "pmullh     %[ftmp6],   %[ftmp3],       %[c]                \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
        MMI_ULDC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "punpckhbh  %[ftmp3],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp3],       %[d]                \n\t"   \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ftmp2]            \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ftmp3]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp5],   %[ftmp5],       %[ff_pw_4]          \n\t"   \
        "paddsh     %[ftmp6],   %[ftmp6],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp5],   %[ftmp5],       %[ftmp4]            \n\t"   \
        "psrah      %[ftmp6],   %[ftmp6],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp5],       %[ftmp6]            \n\t"   \
        MMI_SDC1(%[ftmp1], dst, 0x00)


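/* 4-pixel vertical bilinear row. */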
#define PUT_VP8_BILINEAR4_V_MMI(src, src1, dst, sstride)                    \
        MMI_ULWC1(%[ftmp1], src, 0x00)                                      \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp3],   %[ftmp2],       %[c]                \n\t"   \
                                                                            \
        PTR_ADDU   ""#src1",    "#src",         "#sstride"          \n\t"   \
        MMI_ULWC1(%[ftmp1], src1, 0x00)                                     \
        "punpcklbh  %[ftmp2],   %[ftmp1],       %[ftmp0]            \n\t"   \
        "pmullh     %[ftmp2],   %[ftmp2],       %[d]                \n\t"   \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp2]            \n\t"   \
                                                                            \
        "paddsh     %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"   \
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp4]            \n\t"   \
                                                                            \
        "packushb   %[ftmp1],   %[ftmp3],       %[ftmp0]            \n\t"   \
        MMI_SWC1(%[ftmp1], dst, 0x00)


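/* The seven VP8 subpel filters (taps as in subpel_filters below), with each
 * 16-bit tap replicated four times so a whole register can be multiplied in
 * one pmullh. The signs of taps 1 and 4 are applied by psubsh in the macros
 * above, so all values are stored as positive here. */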
DECLARE_ALIGNED(8, static const uint64_t, fourtap_subpel_filters[7][6]) = {
   {0x0000000000000000, 0x0006000600060006, 0x007b007b007b007b,
    0x000c000c000c000c, 0x0001000100010001, 0x0000000000000000},

   {0x0002000200020002, 0x000b000b000b000b, 0x006c006c006c006c,
    0x0024002400240024, 0x0008000800080008, 0x0001000100010001},

   {0x0000000000000000, 0x0009000900090009, 0x005d005d005d005d,
    0x0032003200320032, 0x0006000600060006, 0x0000000000000000},

   {0x0003000300030003, 0x0010001000100010, 0x004d004d004d004d,
    0x004d004d004d004d, 0x0010001000100010, 0x0003000300030003},

   {0x0000000000000000, 0x0006000600060006, 0x0032003200320032,
    0x005d005d005d005d, 0x0009000900090009, 0x0000000000000000},

   {0x0001000100010001, 0x0008000800080008, 0x0024002400240024,
    0x006c006c006c006c, 0x000b000b000b000b, 0x0002000200020002},

   {0x0000000000000000, 0x0001000100010001, 0x000c000c000c000c,
    0x007b007b007b007b, 0x0006000600060006, 0x0000000000000000}
};

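/* Scalar reference implementation of the subpel filters, disabled but kept
 * to document the arithmetic the MMI macros above implement. */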
#if 0
#define FILTER_6TAP(src, F, stride)                                           \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
        F[0] * src[x - 2 * stride] + F[3] * src[x + 1 * stride] -             \
        F[4] * src[x + 2 * stride] + F[5] * src[x + 3 * stride] + 64) >> 7]

#define FILTER_4TAP(src, F, stride)                                           \
    cm[(F[2] * src[x + 0 * stride] - F[1] * src[x - 1 * stride] +             \
        F[3] * src[x + 1 * stride] - F[4] * src[x + 2 * stride] + 64) >> 7]

static const uint8_t subpel_filters[7][6] = {
    { 0,  6, 123,  12,  1, 0 },
    { 2, 11, 108,  36,  8, 1 },
    { 0,  9,  93,  50,  6, 0 },
    { 3, 16,  77,  77, 16, 3 },
    { 0,  6,  50,  93,  9, 0 },
    { 1,  8,  36, 108, 11, 2 },
    { 0,  1,  12, 123,  6, 0 },
};

#define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
#define MUL_35468(a)  (((a) * 35468) >> 16)
#endif

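/* Scalar loop-filter helpers, mirroring libavcodec/vp8dsp.c; the *_inner
 * filters below fall back to these per-column/per-row routines. */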
#define clip_int8(n) (cm[(n) + 0x80] - 0x80)
static av_always_inline void vp8_filter_common_is4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a += clip_int8(p1 - q1);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
}

static av_always_inline void vp8_filter_common_isnot4tap(uint8_t *p,
        ptrdiff_t stride)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int a, f1, f2;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    a = 3 * (q0 - p0);
    a = clip_int8(a);

    // We deviate from the spec here with c(a+3) >> 3
    // since that's what libvpx does.
    f1 = FFMIN(a + 4, 127) >> 3;
    f2 = FFMIN(a + 3, 127) >> 3;

    // Despite what the spec says, we do need to clamp here to
    // be bitexact with libvpx.
    p[-1 * stride] = cm[p0 + f2];
    p[ 0 * stride] = cm[q0 - f1];
    a              = (f1 + 1) >> 1;
    p[-2 * stride] = cm[p1 + a];
    p[ 1 * stride] = cm[q1 - a];
}

static av_always_inline int vp8_simple_limit(uint8_t *p, ptrdiff_t stride,
        int flim)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return 2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim;
}

static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
{
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];

    return FFABS(p1 - p0) > thresh || FFABS(q1 - q0) > thresh;
}

static av_always_inline void filter_mbedge(uint8_t *p, ptrdiff_t stride)
{
    int a0, a1, a2, w;
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;

    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];

    w = clip_int8(p1 - q1);
    w = clip_int8(w + 3 * (q0 - p0));

    a0 = (27 * w + 63) >> 7;
    a1 = (18 * w + 63) >> 7;
    a2 =  (9 * w + 63) >> 7;

    p[-3 * stride] = cm[p2 + a2];
    p[-2 * stride] = cm[p1 + a1];
    p[-1 * stride] = cm[p0 + a0];
    p[ 0 * stride] = cm[q0 - a0];
    p[ 1 * stride] = cm[q1 - a1];
    p[ 2 * stride] = cm[q2 - a2];
}

static av_always_inline int vp8_normal_limit(uint8_t *p, ptrdiff_t stride,
        int E, int I)
{
    int av_unused p3 = p[-4 * stride];
    int av_unused p2 = p[-3 * stride];
    int av_unused p1 = p[-2 * stride];
    int av_unused p0 = p[-1 * stride];
    int av_unused q0 = p[ 0 * stride];
    int av_unused q1 = p[ 1 * stride];
    int av_unused q2 = p[ 2 * stride];
    int av_unused q3 = p[ 3 * stride];

    return vp8_simple_limit(p, stride, E) &&
           FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
           FFABS(p1 - p0) <= I && FFABS(q3 - q2) <= I &&
           FFABS(q2 - q1) <= I && FFABS(q1 - q0) <= I;
}

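/* Filter a horizontal edge (the "vertical" filter in VP8 naming): load the
 * four rows above and below dst with unaligned gsldlc1/gsldrc1 pairs, run
 * MMI_VP8_LOOP_FILTER on all eight columns at once, store p2..q2 back. */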
vp8_v_loop_filter8_mmi(uint8_t * dst,ptrdiff_t stride,int flim_E,int flim_I,int hev_thresh)785 static av_always_inline void vp8_v_loop_filter8_mmi(uint8_t *dst,
786         ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
787 {
788     double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1    %[q0],      0x07(%[dst])                      \n\t"
        "gsldrc1    %[q0],      0x00(%[dst])                      \n\t"
        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        "gsldlc1    %[p0],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[p0],      0x00(%[tmp0])                     \n\t"
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gsldlc1    %[p1],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[p1],      0x00(%[tmp0])                     \n\t"
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gsldlc1    %[p2],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[p2],      0x00(%[tmp0])                     \n\t"
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gsldlc1    %[p3],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[p3],      0x00(%[tmp0])                     \n\t"
        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        "gsldlc1    %[q1],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[q1],      0x00(%[tmp0])                     \n\t"
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gsldlc1    %[q2],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[q2],      0x00(%[tmp0])                     \n\t"
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gsldlc1    %[q3],      0x07(%[tmp0])                     \n\t"
        "gsldrc1    %[q3],      0x00(%[tmp0])                     \n\t"
        MMI_VP8_LOOP_FILTER
        /* Move to dst */
        "gssdlc1    %[q0],      0x07(%[dst])                      \n\t"
        "gssdrc1    %[q0],      0x00(%[dst])                      \n\t"
        PTR_SUBU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        "gssdlc1    %[p0],      0x07(%[tmp0])                     \n\t"
        "gssdrc1    %[p0],      0x00(%[tmp0])                     \n\t"
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gssdlc1    %[p1],      0x07(%[tmp0])                     \n\t"
        "gssdrc1    %[p1],      0x00(%[tmp0])                     \n\t"
        PTR_SUBU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gssdlc1    %[p2],      0x07(%[tmp0])                     \n\t"
        "gssdrc1    %[p2],      0x00(%[tmp0])                     \n\t"
        PTR_ADDU    "%[tmp0],   %[dst],         %[stride]         \n\t"
        "gssdlc1    %[q1],      0x07(%[tmp0])                     \n\t"
        "gssdrc1    %[q1],      0x00(%[tmp0])                     \n\t"
        PTR_ADDU    "%[tmp0],   %[tmp0],        %[stride]         \n\t"
        "gssdlc1    %[q2],      0x07(%[tmp0])                     \n\t"
        "gssdrc1    %[q2],      0x00(%[tmp0])                     \n\t"
        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

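/*
 * The "inner" (subblock-edge) helpers below are scalar: they reuse the
 * per-pixel C routines vp8_normal_limit(), hev() and vp8_filter_common_*()
 * and simply walk the eight pixels of the edge, rather than going through
 * the MMI vector path above.
 */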
static av_always_inline void vp8_v_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

static av_always_inline void vp8_h_loop_filter8_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    double ftmp[18];
    uint32_t tmp[1];
    DECLARE_DOUBLE_1;
    DECLARE_DOUBLE_2;
    DECLARE_UINT32_T;
    __asm__ volatile(
        /* Get data from dst */
        "gsldlc1    %[p3],        0x03(%[dst])                    \n\t"
        "gsldrc1    %[p3],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[tmp0],     %[dst],           %[stride]     \n\t"
        "gsldlc1    %[p2],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[p2],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[p1],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[p1],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[p0],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[p0],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[q0],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[q0],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[q1],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[q1],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[q2],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[q2],        -0x04(%[tmp0])                  \n\t"
        PTR_ADDU    "%[tmp0],     %[tmp0],          %[stride]     \n\t"
        "gsldlc1    %[q3],        0x03(%[tmp0])                   \n\t"
        "gsldrc1    %[q3],        -0x04(%[tmp0])                  \n\t"
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
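        /*
         * After the 8x8 byte transpose, each register holds one pixel
         * position (p3 .. q3) for all eight rows, so the same filter
         * kernel as in the vertical case can be applied; the second
         * transpose below restores the row layout before storing.
         */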
        MMI_VP8_LOOP_FILTER
        /* Matrix transpose */
        TRANSPOSE_8B(%[p3], %[p2], %[p1], %[p0],
                     %[q0], %[q1], %[q2], %[q3],
                     %[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4])
        /* Move to dst */
        "gssdlc1    %[p3],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[p3],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[p2],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[p2],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[p1],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[p1],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[p0],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[p0],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[q0],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[q0],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[q1],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[q1],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[q2],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[q2],        -0x04(%[dst])                   \n\t"
        PTR_ADDU    "%[dst],      %[dst],           %[stride]     \n\t"
        "gssdlc1    %[q3],        0x03(%[dst])                    \n\t"
        "gssdrc1    %[q3],        -0x04(%[dst])                   \n\t"
        : [p3]"=&f"(ftmp[0]),       [p2]"=&f"(ftmp[1]),
          [p1]"=&f"(ftmp[2]),       [p0]"=&f"(ftmp[3]),
          [q0]"=&f"(ftmp[4]),       [q1]"=&f"(ftmp[5]),
          [q2]"=&f"(ftmp[6]),       [q3]"=&f"(ftmp[7]),
          [ftmp0]"=&f"(ftmp[8]),    [ftmp1]"=&f"(ftmp[9]),
          [ftmp2]"=&f"(ftmp[10]),   [ftmp3]"=&f"(ftmp[11]),
          [hev]"=&f"(ftmp[12]),     [mask]"=&f"(ftmp[13]),
          [ftmp4]"=&f"(ftmp[14]),   [ftmp5]"=&f"(ftmp[15]),
          [ftmp6]"=&f"(ftmp[16]),   [ftmp7]"=&f"(ftmp[17]),
          [dst]"+&r"(dst),          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_DOUBLE_1,    RESTRICT_ASM_DOUBLE_2,
          RESTRICT_ASM_UINT32_T
        : [e]"r"((mips_reg)flim_E), [thresh]"r"((mips_reg)hev_thresh),
          [i]"r"((mips_reg)flim_I), [stride]"r"((mips_reg)stride)
        : "memory"
    );
}

static av_always_inline void vp8_h_loop_filter8_inner_mmi(uint8_t *dst,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 8; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_luma_dc_wht_mmi(int16_t block[4][4][16], int16_t dc[16])
{
#if 1
    double ftmp[8];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        MMI_LDC1(%[ftmp0], %[dc], 0x00)
        MMI_LDC1(%[ftmp1], %[dc], 0x08)
        MMI_LDC1(%[ftmp2], %[dc], 0x10)
        MMI_LDC1(%[ftmp3], %[dc], 0x18)
        "paddsh     %[ftmp4],   %[ftmp0],       %[ftmp3]            \n\t"
        "psubsh     %[ftmp5],   %[ftmp0],       %[ftmp3]            \n\t"
        "paddsh     %[ftmp6],   %[ftmp1],       %[ftmp2]            \n\t"
        "psubsh     %[ftmp7],   %[ftmp1],       %[ftmp2]            \n\t"
        "paddsh     %[ftmp0],   %[ftmp4],       %[ftmp6]            \n\t"
        "paddsh     %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
        "psubsh     %[ftmp2],   %[ftmp4],       %[ftmp6]            \n\t"
        "psubsh     %[ftmp3],   %[ftmp5],       %[ftmp7]            \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp1], %[dc], 0x08)
        MMI_SDC1(%[ftmp2], %[dc], 0x10)
        MMI_SDC1(%[ftmp3], %[dc], 0x18)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),
          RESTRICT_ASM_ALL64
          [ftmp7]"=&f"(ftmp[7])
        : [dc]"r"((uint8_t*)dc)
        : "memory"
    );

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        MMI_SDC1(%[ftmp0], %[dc], 0x00)
        MMI_SDC1(%[ftmp0], %[dc], 0x08)
        MMI_SDC1(%[ftmp0], %[dc], 0x10)
        MMI_SDC1(%[ftmp0], %[dc], 0x18)
        : RESTRICT_ASM_ALL64
          [ftmp0]"=&f"(ftmp[0])
        : [dc]"r"((uint8_t *)dc)
        : "memory"
    );
#else
    int t00, t01, t02, t03, t10, t11, t12, t13, t20, t21, t22, t23, t30, t31, t32, t33;

    t00 = dc[0] + dc[12];
    t10 = dc[1] + dc[13];
    t20 = dc[2] + dc[14];
    t30 = dc[3] + dc[15];

    t03 = dc[0] - dc[12];
    t13 = dc[1] - dc[13];
    t23 = dc[2] - dc[14];
    t33 = dc[3] - dc[15];

    t01 = dc[4] + dc[ 8];
    t11 = dc[5] + dc[ 9];
    t21 = dc[6] + dc[10];
    t31 = dc[7] + dc[11];

    t02 = dc[4] - dc[ 8];
    t12 = dc[5] - dc[ 9];
    t22 = dc[6] - dc[10];
    t32 = dc[7] - dc[11];

    dc[ 0] = t00 + t01;
    dc[ 1] = t10 + t11;
    dc[ 2] = t20 + t21;
    dc[ 3] = t30 + t31;

    dc[ 4] = t03 + t02;
    dc[ 5] = t13 + t12;
    dc[ 6] = t23 + t22;
    dc[ 7] = t33 + t32;

    dc[ 8] = t00 - t01;
    dc[ 9] = t10 - t11;
    dc[10] = t20 - t21;
    dc[11] = t30 - t31;

    dc[12] = t03 - t02;
    dc[13] = t13 - t12;
    dc[14] = t23 - t22;
    dc[15] = t33 - t32;

    block[0][0][0] = (dc[0] + dc[3] + 3 + dc[1] + dc[2]) >> 3;
    block[0][1][0] = (dc[0] - dc[3] + 3 + dc[1] - dc[2]) >> 3;
    block[0][2][0] = (dc[0] + dc[3] + 3 - dc[1] - dc[2]) >> 3;
    block[0][3][0] = (dc[0] - dc[3] + 3 - dc[1] + dc[2]) >> 3;

    block[1][0][0] = (dc[4] + dc[7] + 3 + dc[5] + dc[6]) >> 3;
    block[1][1][0] = (dc[4] - dc[7] + 3 + dc[5] - dc[6]) >> 3;
    block[1][2][0] = (dc[4] + dc[7] + 3 - dc[5] - dc[6]) >> 3;
    block[1][3][0] = (dc[4] - dc[7] + 3 - dc[5] + dc[6]) >> 3;

    block[2][0][0] = (dc[8] + dc[11] + 3 + dc[9] + dc[10]) >> 3;
    block[2][1][0] = (dc[8] - dc[11] + 3 + dc[9] - dc[10]) >> 3;
    block[2][2][0] = (dc[8] + dc[11] + 3 - dc[9] - dc[10]) >> 3;
    block[2][3][0] = (dc[8] - dc[11] + 3 - dc[9] + dc[10]) >> 3;

    block[3][0][0] = (dc[12] + dc[15] + 3 + dc[13] + dc[14]) >> 3;
    block[3][1][0] = (dc[12] - dc[15] + 3 + dc[13] - dc[14]) >> 3;
    block[3][2][0] = (dc[12] + dc[15] + 3 - dc[13] - dc[14]) >> 3;
    block[3][3][0] = (dc[12] - dc[15] + 3 - dc[13] + dc[14]) >> 3;

    AV_ZERO64(dc + 0);
    AV_ZERO64(dc + 4);
    AV_ZERO64(dc + 8);
    AV_ZERO64(dc + 12);
#endif
}

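/*
 * DC-only inverse WHT: with only dc[0] nonzero, both transform passes
 * reduce to replication, so every one of the 16 outputs equals
 * (dc[0] + 3) >> 3, i.e. dc[0] / 8 with rounding (e.g. dc[0] = 21 gives
 * (21 + 3) >> 3 = 3 in all 16 blocks).
 */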
void ff_vp8_luma_dc_wht_dc_mmi(int16_t block[4][4][16], int16_t dc[16])
{
    int val = (dc[0] + 3) >> 3;

    dc[0] = 0;

    block[0][0][0] = val;
    block[0][1][0] = val;
    block[0][2][0] = val;
    block[0][3][0] = val;
    block[1][0][0] = val;
    block[1][1][0] = val;
    block[1][2][0] = val;
    block[1][3][0] = val;
    block[2][0][0] = val;
    block[2][1][0] = val;
    block[2][2][0] = val;
    block[2][3][0] = val;
    block[3][0][0] = val;
    block[3][1][0] = val;
    block[3][2][0] = val;
    block[3][3][0] = val;
}

void ff_vp8_idct_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = {0x4e7b4e7b4e7b4e7bULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = {0x22a322a322a322a3ULL};
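    /*
     * 0x22a3 = 8867 and 0x4e7b = 20091 implement the fixed-point
     * multiplies of the scalar reference code:
     *     #define MUL_20091(a) ((((a) * 20091) >> 16) + (a))
     *     #define MUL_35468(a)  (((a) * 35468) >> 16)
     * pmulhh keeps the high 16 bits of each 16x16 product, so shifting
     * left by 2 first and multiplying by 8867 yields (a * 35468) >> 16
     * (8867 * 4 = 35468), and the pmulhh/paddh pairs below compute
     * MUL_20091() without losing the "+ a" term.
     */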
    double ftmp[12];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp2], %[block], 0x08)
        MMI_LDC1(%[ftmp3], %[block], 0x10)
        MMI_LDC1(%[ftmp4], %[block], 0x18)

        "li         %[tmp0],    0x02                                \n\t"
        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"

        // block[0...3] + block[8...11]
        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
        // block[0...3] - block[8...11]
        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
        // MUL_35468(block[12...15])
        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
        "pmulhh     %[ftmp7],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
        // MUL_35468(block[4...7])
        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
        "pmulhh     %[ftmp8],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
        // MUL_20091(block[4...7])
        "pmulhh     %[ftmp9],   %[ftmp2],       %[ff_ph_4e7b]       \n\t"
        "paddh      %[ftmp9],   %[ftmp9],       %[ftmp2]            \n\t"
        // MUL_20091(block[12...15])
        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
        "paddh      %[ftmp10],  %[ftmp10],      %[ftmp4]            \n\t"

        // tmp[0 4  8 12]
        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp7]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp9]            \n\t"
        // tmp[1 5  9 13]
        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp8]            \n\t"
        "psubh      %[ftmp2],   %[ftmp2],       %[ftmp10]           \n\t"
        // tmp[2 6 10 14]
        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp8]            \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp10]           \n\t"
        // tmp[3 7 11 15]
        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp7]            \n\t"
        "psubh      %[ftmp4],   %[ftmp4],       %[ftmp9]            \n\t"

        MMI_SDC1(%[ftmp0], %[block], 0x00)
        MMI_SDC1(%[ftmp0], %[block], 0x08)
        MMI_SDC1(%[ftmp0], %[block], 0x10)
        MMI_SDC1(%[ftmp0], %[block], 0x18)

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        // t[0 4  8 12]
        "paddh      %[ftmp5],   %[ftmp1],       %[ftmp3]            \n\t"
        // t[1 5  9 13]
        "psubh      %[ftmp6],   %[ftmp1],       %[ftmp3]            \n\t"
        // t[2 6 10 14]
        "psllh      %[ftmp9],   %[ftmp2],       %[ftmp11]           \n\t"
        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
        "psubh      %[ftmp7],   %[ftmp9],       %[ftmp4]            \n\t"
        "pmulhh     %[ftmp10],  %[ftmp4],       %[ff_ph_4e7b]       \n\t"
        "psubh      %[ftmp7],   %[ftmp7],       %[ftmp10]           \n\t"
        // t[3 7 11 15]
        "psllh      %[ftmp9],   %[ftmp4],       %[ftmp11]           \n\t"
        "pmulhh     %[ftmp9],   %[ftmp9],       %[ff_ph_22a3]       \n\t"
        "paddh      %[ftmp8],   %[ftmp9],       %[ftmp2]            \n\t"
        "pmulhh     %[ftmp10],  %[ftmp2],       %[ff_ph_4e7b]       \n\t"
        "paddh      %[ftmp8],   %[ftmp8],       %[ftmp10]           \n\t"

        "li         %[tmp0],    0x03                                \n\t"
        "mtc1       %[tmp0],    %[ftmp11]                           \n\t"
        "paddh      %[ftmp1],   %[ftmp5],       %[ftmp8]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],       %[ff_pw_4]          \n\t"
        "psrah      %[ftmp1],   %[ftmp1],       %[ftmp11]           \n\t"
        "paddh      %[ftmp2],   %[ftmp6],       %[ftmp7]            \n\t"
        "paddh      %[ftmp2],   %[ftmp2],       %[ff_pw_4]          \n\t"
        "psrah      %[ftmp2],   %[ftmp2],       %[ftmp11]           \n\t"
        "psubh      %[ftmp3],   %[ftmp6],       %[ftmp7]            \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ff_pw_4]          \n\t"
        "psrah      %[ftmp3],   %[ftmp3],       %[ftmp11]           \n\t"
        "psubh      %[ftmp4],   %[ftmp5],       %[ftmp8]            \n\t"
        "paddh      %[ftmp4],   %[ftmp4],       %[ff_pw_4]          \n\t"
        "psrah      %[ftmp4],   %[ftmp4],       %[ftmp11]           \n\t"

        TRANSPOSE_4H(%[ftmp1], %[ftmp2], %[ftmp3], %[ftmp4],
                     %[ftmp5], %[ftmp6], %[ftmp7], %[ftmp8])

        MMI_LWC1(%[ftmp5], %[dst0], 0x00)
        MMI_LWC1(%[ftmp6], %[dst1], 0x00)
        MMI_LWC1(%[ftmp7], %[dst2], 0x00)
        MMI_LWC1(%[ftmp8], %[dst3], 0x00)

        "punpcklbh  %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],       %[ftmp0]            \n\t"

        "paddh      %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
        "paddh      %[ftmp2],   %[ftmp2],       %[ftmp6]            \n\t"
        "paddh      %[ftmp3],   %[ftmp3],       %[ftmp7]            \n\t"
        "paddh      %[ftmp4],   %[ftmp4],       %[ftmp8]            \n\t"

        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),            [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]),          [ftmp11]"=&f"(ftmp[11]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ALL64
          [tmp0]"=&r"(tmp[0])
        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
          [block]"r"(block),                [ff_pw_4]"f"(ff_pw_4),
          [ff_ph_4e7b]"f"(ff_ph_4e7b),      [ff_ph_22a3]"f"(ff_ph_22a3)
        : "memory"
    );
#else
    int i, t0, t1, t2, t3;
    int16_t tmp[16];

    for (i = 0; i < 4; i++) {
        t0 = block[0 + i] + block[8 + i];
        t1 = block[0 + i] - block[8 + i];
        t2 = MUL_35468(block[4 + i]) - MUL_20091(block[12 + i]);
        t3 = MUL_20091(block[4 + i]) + MUL_35468(block[12 + i]);
        block[ 0 + i] = 0;
        block[ 4 + i] = 0;
        block[ 8 + i] = 0;
        block[12 + i] = 0;

        tmp[i * 4 + 0] = t0 + t3;
        tmp[i * 4 + 1] = t1 + t2;
        tmp[i * 4 + 2] = t1 - t2;
        tmp[i * 4 + 3] = t0 - t3;
    }

    for (i = 0; i < 4; i++) {
        t0 = tmp[0 + i] + tmp[8 + i];
        t1 = tmp[0 + i] - tmp[8 + i];
        t2 = MUL_35468(tmp[4 + i]) - MUL_20091(tmp[12 + i]);
        t3 = MUL_20091(tmp[4 + i]) + MUL_35468(tmp[12 + i]);

        dst[0] = av_clip_uint8(dst[0] + ((t0 + t3 + 4) >> 3));
        dst[1] = av_clip_uint8(dst[1] + ((t1 + t2 + 4) >> 3));
        dst[2] = av_clip_uint8(dst[2] + ((t1 - t2 + 4) >> 3));
        dst[3] = av_clip_uint8(dst[3] + ((t0 - t3 + 4) >> 3));
        dst   += stride;
    }
#endif
}

void ff_vp8_idct_dc_add_mmi(uint8_t *dst, int16_t block[16], ptrdiff_t stride)
{
#if 1
    int dc = (block[0] + 4) >> 3;
    double ftmp[6];
    DECLARE_VAR_LOW32;

    block[0] = 0;

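    /*
     * Broadcast the pre-rounded DC value to all four 16-bit lanes
     * (pshufh), widen each 4-pixel row to 16 bits, add with signed
     * saturation, then pack back with unsigned saturation, which also
     * performs the clip to 0..255.
     */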
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "mtc1       %[dc],      %[ftmp5]                            \n\t"
        MMI_LWC1(%[ftmp1], %[dst0], 0x00)
        MMI_LWC1(%[ftmp2], %[dst1], 0x00)
        MMI_LWC1(%[ftmp3], %[dst2], 0x00)
        MMI_LWC1(%[ftmp4], %[dst3], 0x00)
        "pshufh     %[ftmp5],   %[ftmp5],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
        "paddsh     %[ftmp1],   %[ftmp1],       %[ftmp5]            \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],       %[ftmp5]            \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],       %[ftmp5]            \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],       %[ftmp5]            \n\t"
        "packushb   %[ftmp1],   %[ftmp1],       %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],       %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],       %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],       %[ftmp0]            \n\t"
        MMI_SWC1(%[ftmp1], %[dst0], 0x00)
        MMI_SWC1(%[ftmp2], %[dst1], 0x00)
        MMI_SWC1(%[ftmp3], %[dst2], 0x00)
        MMI_SWC1(%[ftmp4], %[dst3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),
          RESTRICT_ASM_LOW32
          [ftmp5]"=&f"(ftmp[5])
        : [dst0]"r"(dst),                   [dst1]"r"(dst+stride),
          [dst2]"r"(dst+2*stride),          [dst3]"r"(dst+3*stride),
          [dc]"r"(dc)
        : "memory"
    );
#else
    int i, dc = (block[0] + 4) >> 3;

    block[0] = 0;

    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uint8(dst[0] + dc);
        dst[1] = av_clip_uint8(dst[1] + dc);
        dst[2] = av_clip_uint8(dst[2] + dc);
        dst[3] = av_clip_uint8(dst[3] + dc);
        dst   += stride;
    }
#endif
}

void ff_vp8_idct_dc_add4y_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst +  0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst +  4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst +  8, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + 12, block[3], stride);
}

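/*
 * Unlike the luma case above (four 4x4 blocks side by side at x = 0, 4,
 * 8 and 12), the 8x8 chroma plane carries its four blocks in a 2x2
 * arrangement, so the lower pair starts 4 rows down.
 */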
void ff_vp8_idct_dc_add4uv_mmi(uint8_t *dst, int16_t block[4][16],
        ptrdiff_t stride)
{
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 0, block[0], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 0 + 4, block[1], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 0, block[2], stride);
    ff_vp8_idct_dc_add_mmi(dst + stride * 4 + 4, block[3], stride);
}

// loop filter applied to edges between macroblocks
void ff_vp8_v_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dst + 8, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter16_mmi(uint8_t *dst, ptrdiff_t stride, int flim_E,
        int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dst, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dst + 8 * stride, stride, flim_E, flim_I,
                           hev_thresh);
}

void ff_vp8_v_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_mmi(uint8_t *dstU, uint8_t *dstV, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

// loop filter applied to inner macroblock edges
void ff_vp8_v_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * 1, stride, flim_E, flim_I)) {
            int hv = hev(dst + i * 1, stride, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * 1, stride);
            else
                vp8_filter_common_isnot4tap(dst + i * 1, stride);
        }
}

void ff_vp8_h_loop_filter16_inner_mmi(uint8_t *dst, ptrdiff_t stride,
        int flim_E, int flim_I, int hev_thresh)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_normal_limit(dst + i * stride, 1, flim_E, flim_I)) {
            int hv = hev(dst + i * stride, 1, hev_thresh);
            if (hv)
                vp8_filter_common_is4tap(dst + i * stride, 1);
            else
                vp8_filter_common_isnot4tap(dst + i * stride, 1);
        }
}

void ff_vp8_v_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_v_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_v_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

void ff_vp8_h_loop_filter8uv_inner_mmi(uint8_t *dstU, uint8_t *dstV,
        ptrdiff_t stride, int flim_E, int flim_I, int hev_thresh)
{
    vp8_h_loop_filter8_inner_mmi(dstU, stride, flim_E, flim_I, hev_thresh);
    vp8_h_loop_filter8_inner_mmi(dstV, stride, flim_E, flim_I, hev_thresh);
}

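/*
 * The "simple" loop filter only checks the edge limit and, when it
 * passes, always applies the common 4-tap filter; there is no inner
 * threshold and no high-edge-variance handling.
 */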
void ff_vp8_v_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i, stride, flim))
            vp8_filter_common_is4tap(dst + i, stride);
}

void ff_vp8_h_loop_filter_simple_mmi(uint8_t *dst, ptrdiff_t stride, int flim)
{
    int i;

    for (i = 0; i < 16; i++)
        if (vp8_simple_limit(dst + i * stride, 1, flim))
            vp8_filter_common_is4tap(dst + i * stride, 1);
}

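/*
 * Plain block copies using unaligned ldl/ldr + sdl/sdr pairs.  Two rows
 * are handled per iteration (h is decremented by 2), which assumes an
 * even h; that holds for the block heights VP8 uses here.
 */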
void ff_put_vp8_pixels16_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[2];
    uint64_t tmp[2];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                         \n\t"
        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl        %[tmp0],    0x0f(%[src])                        \n\t"
        "ldr        %[tmp0],    0x08(%[src])                        \n\t"
        MMI_ULDC1(%[ftmp1], %[addr0], 0x00)
        "ldl        %[tmp1],    0x0f(%[addr0])                      \n\t"
        "ldr        %[tmp1],    0x08(%[addr0])                      \n\t"
        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "sdl        %[tmp0],    0x0f(%[dst])                        \n\t"
        "sdr        %[tmp0],    0x08(%[dst])                        \n\t"
        "addiu      %[h],       %[h],           -0x02               \n\t"
        MMI_SDC1(%[ftmp1], %[addr1], 0x00)
        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
        "sdl        %[tmp1],    0x0f(%[addr1])                      \n\t"
        "sdr        %[tmp1],    0x08(%[addr1])                      \n\t"
        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [tmp0]"=&r"(tmp[0]),              [tmp1]"=&r"(tmp[1]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst),                  [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 16);
#endif
}

void ff_put_vp8_pixels8_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_ALL64;

    __asm__ volatile (
        "1:                                                         \n\t"
        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
        MMI_ULDC1(%[ftmp0], %[src], 0x00)
        "ldl        %[tmp0],    0x07(%[addr0])                      \n\t"
        "ldr        %[tmp0],    0x00(%[addr0])                      \n\t"
        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
        MMI_SDC1(%[ftmp0], %[dst], 0x00)
        "addiu      %[h],       %[h],           -0x02               \n\t"
        "sdl        %[tmp0],    0x07(%[addr1])                      \n\t"
        "sdr        %[tmp0],    0x00(%[addr1])                      \n\t"
        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst),                  [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 8);
#endif
}

void ff_put_vp8_pixels4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int x, int y)
{
#if 1
    double ftmp[1];
    uint64_t tmp[1];
    mips_reg addr[2];
    DECLARE_VAR_LOW32;

    __asm__ volatile (
        "1:                                                         \n\t"
        PTR_ADDU   "%[addr0],   %[src],         %[srcstride]        \n\t"
        MMI_LWC1(%[ftmp0], %[src], 0x00)
        "lwl        %[tmp0],    0x03(%[addr0])                      \n\t"
        "lwr        %[tmp0],    0x00(%[addr0])                      \n\t"
        PTR_ADDU   "%[addr1],   %[dst],         %[dststride]        \n\t"
        MMI_SWC1(%[ftmp0], %[dst], 0x00)
        "addiu      %[h],       %[h],           -0x02               \n\t"
        "swl        %[tmp0],    0x03(%[addr1])                      \n\t"
        "swr        %[tmp0],    0x00(%[addr1])                      \n\t"
        PTR_ADDU   "%[src],     %[addr0],       %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[addr1],       %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [addr0]"=&r"(addr[0]),            [addr1]"=&r"(addr[1]),
          [dst]"+&r"(dst),                  [src]"+&r"(src),
          [h]"+&r"(h)
        : [dststride]"r"((mips_reg)dststride),
          [srcstride]"r"((mips_reg)srcstride)
        : "memory"
    );
#else
    int i;

    for (i = 0; i < h; i++, dst += dststride, src += srcstride)
        memcpy(dst, src, 4);
#endif
}

void ff_put_vp8_epel16_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1, dst1;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];

    dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 7] + filter[3] * src[ 9] - filter[4] * src[10] + 64) >> 7];
    dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 8] + filter[3] * src[10] - filter[4] * src[11] + 64) >> 7];
    dst[10] = cm[(filter[2] * src[10] - filter[1] * src[ 9] + filter[3] * src[11] - filter[4] * src[12] + 64) >> 7];
    dst[11] = cm[(filter[2] * src[11] - filter[1] * src[10] + filter[3] * src[12] - filter[4] * src[13] + 64) >> 7];
    dst[12] = cm[(filter[2] * src[12] - filter[1] * src[11] + filter[3] * src[13] - filter[4] * src[14] + 64) >> 7];
    dst[13] = cm[(filter[2] * src[13] - filter[1] * src[12] + filter[3] * src[14] - filter[4] * src[15] + 64) >> 7];
    dst[14] = cm[(filter[2] * src[14] - filter[1] * src[13] + filter[3] * src[15] - filter[4] * src[16] + 64) >> 7];
    dst[15] = cm[(filter[2] * src[15] - filter[1] * src[14] + filter[3] * src[16] - filter[4] * src[17] + 64) >> 7];
    */
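    /*
     * The 16-pixel row is produced as two 8-pixel halves by invoking
     * PUT_VP8_EPEL8_H4_MMI at offsets 0 and 8.  Each filter[n] passed in
     * via an "f" constraint is presumably the 16-bit tap replicated
     * across the 64-bit value (the layout of fourtap_subpel_filters is
     * defined elsewhere in this file).
     */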
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])
        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_H4_MMI(%[src1], %[dst1])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    dst[4] = cm[(filter[2] * src[4] - filter[1] * src[ 3] + filter[3] * src[5] - filter[4] * src[6] + 64) >> 7];
    dst[5] = cm[(filter[2] * src[5] - filter[1] * src[ 4] + filter[3] * src[6] - filter[4] * src[7] + 64) >> 7];
    dst[6] = cm[(filter[2] * src[6] - filter[1] * src[ 5] + filter[3] * src[7] - filter[4] * src[8] + 64) >> 7];
    dst[7] = cm[(filter[2] * src[7] - filter[1] * src[ 6] + filter[3] * src[8] - filter[4] * src[9] + 64) >> 7];
    */
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL8_H4_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_h4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[6];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;

    /*
    dst[0] = cm[(filter[2] * src[0] - filter[1] * src[-1] + filter[3] * src[1] - filter[4] * src[2] + 64) >> 7];
    dst[1] = cm[(filter[2] * src[1] - filter[1] * src[ 0] + filter[3] * src[2] - filter[4] * src[3] + 64) >> 7];
    dst[2] = cm[(filter[2] * src[2] - filter[1] * src[ 1] + filter[3] * src[3] - filter[4] * src[4] + 64) >> 7];
    dst[3] = cm[(filter[2] * src[3] - filter[1] * src[ 2] + filter[3] * src[4] - filter[4] * src[5] + 64) >> 7];
    */
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL4_H4_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
          [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_4TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel16_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    mips_reg src1, dst1;
    DECLARE_VAR_ALL64;

    /*
    dst[ 0] = cm[(filter[2]*src[ 0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[ 1] - filter[4]*src[ 2] + filter[5]*src[ 3] + 64) >> 7];
    dst[ 1] = cm[(filter[2]*src[ 1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[ 2] - filter[4]*src[ 3] + filter[5]*src[ 4] + 64) >> 7];
    dst[ 2] = cm[(filter[2]*src[ 2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[ 3] - filter[4]*src[ 4] + filter[5]*src[ 5] + 64) >> 7];
    dst[ 3] = cm[(filter[2]*src[ 3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[ 4] - filter[4]*src[ 5] + filter[5]*src[ 6] + 64) >> 7];
    dst[ 4] = cm[(filter[2]*src[ 4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[ 5] - filter[4]*src[ 6] + filter[5]*src[ 7] + 64) >> 7];
    dst[ 5] = cm[(filter[2]*src[ 5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[ 6] - filter[4]*src[ 7] + filter[5]*src[ 8] + 64) >> 7];
    dst[ 6] = cm[(filter[2]*src[ 6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[ 7] - filter[4]*src[ 8] + filter[5]*src[ 9] + 64) >> 7];
    dst[ 7] = cm[(filter[2]*src[ 7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[ 8] - filter[4]*src[ 9] + filter[5]*src[10] + 64) >> 7];

    dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 7] + filter[0]*src[ 6] + filter[3]*src[ 9] - filter[4]*src[10] + filter[5]*src[11] + 64) >> 7];
    dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 8] + filter[0]*src[ 7] + filter[3]*src[10] - filter[4]*src[11] + filter[5]*src[12] + 64) >> 7];
    dst[10] = cm[(filter[2]*src[10] - filter[1]*src[ 9] + filter[0]*src[ 8] + filter[3]*src[11] - filter[4]*src[12] + filter[5]*src[13] + 64) >> 7];
    dst[11] = cm[(filter[2]*src[11] - filter[1]*src[10] + filter[0]*src[ 9] + filter[3]*src[12] - filter[4]*src[13] + filter[5]*src[14] + 64) >> 7];
    dst[12] = cm[(filter[2]*src[12] - filter[1]*src[11] + filter[0]*src[10] + filter[3]*src[13] - filter[4]*src[14] + filter[5]*src[15] + 64) >> 7];
    dst[13] = cm[(filter[2]*src[13] - filter[1]*src[12] + filter[0]*src[11] + filter[3]*src[14] - filter[4]*src[15] + filter[5]*src[16] + 64) >> 7];
    dst[14] = cm[(filter[2]*src[14] - filter[1]*src[13] + filter[0]*src[12] + filter[3]*src[15] - filter[4]*src[16] + filter[5]*src[17] + 64) >> 7];
    dst[15] = cm[(filter[2]*src[15] - filter[1]*src[14] + filter[0]*src[13] + filter[3]*src[16] - filter[4]*src[17] + filter[5]*src[18] + 64) >> 7];
    */
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        // 0 - 7
        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])
        PTR_ADDIU  "%[src1],    %[src],         0x08                \n\t"
        PTR_ADDIU  "%[dst1],    %[dst],         0x08                \n\t"
        // 8 - 15
        PUT_VP8_EPEL8_H6_MMI(%[src1], %[dst1])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [dst1]"=&r"(dst1),                [src1]"=&r"(src1),
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel8_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[9];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
    dst[4] = cm[(filter[2]*src[4] - filter[1]*src[ 3] + filter[0]*src[ 2] + filter[3]*src[5] - filter[4]*src[6] + filter[5]*src[ 7] + 64) >> 7];
    dst[5] = cm[(filter[2]*src[5] - filter[1]*src[ 4] + filter[0]*src[ 3] + filter[3]*src[6] - filter[4]*src[7] + filter[5]*src[ 8] + 64) >> 7];
    dst[6] = cm[(filter[2]*src[6] - filter[1]*src[ 5] + filter[0]*src[ 4] + filter[3]*src[7] - filter[4]*src[8] + filter[5]*src[ 9] + 64) >> 7];
    dst[7] = cm[(filter[2]*src[7] - filter[1]*src[ 6] + filter[0]*src[ 5] + filter[3]*src[8] - filter[4]*src[9] + filter[5]*src[10] + 64) >> 7];
    */
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL8_H6_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

void ff_put_vp8_epel4_h6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
        ptrdiff_t srcstride, int h, int mx, int my)
{
#if 1
    const uint64_t *filter = fourtap_subpel_filters[mx - 1];
    double ftmp[6];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;

    /*
    dst[0] = cm[(filter[2]*src[0] - filter[1]*src[-1] + filter[0]*src[-2] + filter[3]*src[1] - filter[4]*src[2] + filter[5]*src[ 3] + 64) >> 7];
    dst[1] = cm[(filter[2]*src[1] - filter[1]*src[ 0] + filter[0]*src[-1] + filter[3]*src[2] - filter[4]*src[3] + filter[5]*src[ 4] + 64) >> 7];
    dst[2] = cm[(filter[2]*src[2] - filter[1]*src[ 1] + filter[0]*src[ 0] + filter[3]*src[3] - filter[4]*src[4] + filter[5]*src[ 5] + 64) >> 7];
    dst[3] = cm[(filter[2]*src[3] - filter[1]*src[ 2] + filter[0]*src[ 1] + filter[3]*src[4] - filter[4]*src[5] + filter[5]*src[ 6] + 64) >> 7];
    */
    __asm__ volatile (
        "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
        "li         %[tmp0],    0x07                                \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                            \n\t"

        "1:                                                         \n\t"
        PUT_VP8_EPEL4_H6_MMI(%[src], %[dst])

        "addiu      %[h],       %[h],           -0x01               \n\t"
        PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
        PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
        "bnez       %[h],       1b                                  \n\t"
        : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          [h]"+&r"(h),
          [dst]"+&r"(dst),                  [src]"+&r"(src)
        : [ff_pw_64]"f"(ff_pw_64),
          [srcstride]"r"((mips_reg)srcstride),
          [dststride]"r"((mips_reg)dststride),
          [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
          [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
          [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
        : "memory"
    );
#else
    const uint8_t *filter = subpel_filters[mx - 1];
    const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
    int x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = FILTER_6TAP(src, filter, 1);
        dst += dststride;
        src += srcstride;
    }
#endif
}

1979 void ff_put_vp8_epel16_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
1980         ptrdiff_t srcstride, int h, int mx, int my)
1981 {
1982 #if 1
1983     const uint64_t *filter = fourtap_subpel_filters[my - 1];
1984     double ftmp[9];
1985     uint32_t tmp[1];
1986     mips_reg src0, src1, dst0;
1987     DECLARE_VAR_ALL64;
1988 
1989     /*
1990     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
1991     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
1992     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
1993     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
1994     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
1995     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
1996     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
1997     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
1998 
1999     dst[ 8] = cm[(filter[2] * src[ 8] - filter[1] * src[ 8-srcstride] + filter[3] * src[ 8+srcstride] - filter[4] * src[ 8+2*srcstride] + 64) >> 7];
2000     dst[ 9] = cm[(filter[2] * src[ 9] - filter[1] * src[ 9-srcstride] + filter[3] * src[ 9+srcstride] - filter[4] * src[ 9+2*srcstride] + 64) >> 7];
2001     dst[10] = cm[(filter[2] * src[10] - filter[1] * src[10-srcstride] + filter[3] * src[10+srcstride] - filter[4] * src[10+2*srcstride] + 64) >> 7];
2002     dst[11] = cm[(filter[2] * src[11] - filter[1] * src[11-srcstride] + filter[3] * src[11+srcstride] - filter[4] * src[11+2*srcstride] + 64) >> 7];
2003     dst[12] = cm[(filter[2] * src[12] - filter[1] * src[12-srcstride] + filter[3] * src[12+srcstride] - filter[4] * src[12+2*srcstride] + 64) >> 7];
2004     dst[13] = cm[(filter[2] * src[13] - filter[1] * src[13-srcstride] + filter[3] * src[13+srcstride] - filter[4] * src[13+2*srcstride] + 64) >> 7];
2005     dst[14] = cm[(filter[2] * src[14] - filter[1] * src[14-srcstride] + filter[3] * src[14+srcstride] - filter[4] * src[14+2*srcstride] + 64) >> 7];
2006     dst[15] = cm[(filter[2] * src[15] - filter[1] * src[15-srcstride] + filter[3] * src[15+srcstride] - filter[4] * src[15+2*srcstride] + 64) >> 7];
2007     */
2008     __asm__ volatile (
2009         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2010         "li         %[tmp0],    0x07                                \n\t"
2011         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2012 
2013         "1:                                                         \n\t"
2014         // 0 - 7
2015         PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2016         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2017         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2018         // 8 - 15
2019         PUT_VP8_EPEL8_V4_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2020 
2021         "addiu      %[h],       %[h],           -0x01               \n\t"
2022         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2023         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2024         "bnez       %[h],       1b                                  \n\t"
2025         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2026           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2027           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2028           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2029           [ftmp8]"=&f"(ftmp[8]),
2030           [tmp0]"=&r"(tmp[0]),
2031           RESTRICT_ASM_ALL64
2032           [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2033           [src1]"=&r"(src1),
2034           [h]"+&r"(h),
2035           [dst]"+&r"(dst),                  [src]"+&r"(src)
2036         : [ff_pw_64]"f"(ff_pw_64),
2037           [srcstride]"r"((mips_reg)srcstride),
2038           [dststride]"r"((mips_reg)dststride),
2039           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2040           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2041         : "memory"
2042     );
2043 #else
2044     const uint8_t *filter = subpel_filters[my - 1];
2045     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2046     int x, y;
2047 
2048     for (y = 0; y < h; y++) {
2049         for (x = 0; x < 16; x++)
2050             dst[x] = FILTER_4TAP(src, filter, srcstride);
2051         dst += dststride;
2052         src += srcstride;
2053     }
2054 #endif
2055 }
2056 
2057 void ff_put_vp8_epel8_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2058         ptrdiff_t srcstride, int h, int mx, int my)
2059 {
2060 #if 1
2061     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2062     double ftmp[9];
2063     uint32_t tmp[1];
2064     mips_reg src1;
2065     DECLARE_VAR_ALL64;
2066 
2067     /*
2068     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2069     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2070     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2071     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2072     dst[4] = cm[(filter[2] * src[4] - filter[1] * src[4-srcstride] + filter[3] * src[4+srcstride] - filter[4] * src[4+2*srcstride] + 64) >> 7];
2073     dst[5] = cm[(filter[2] * src[5] - filter[1] * src[5-srcstride] + filter[3] * src[5+srcstride] - filter[4] * src[5+2*srcstride] + 64) >> 7];
2074     dst[6] = cm[(filter[2] * src[6] - filter[1] * src[6-srcstride] + filter[3] * src[6+srcstride] - filter[4] * src[6+2*srcstride] + 64) >> 7];
2075     dst[7] = cm[(filter[2] * src[7] - filter[1] * src[7-srcstride] + filter[3] * src[7+srcstride] - filter[4] * src[7+2*srcstride] + 64) >> 7];
2076     */
2077     __asm__ volatile (
2078         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2079         "li         %[tmp0],    0x07                                \n\t"
2080         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2081 
2082         "1:                                                         \n\t"
2083         PUT_VP8_EPEL8_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2084 
2085         "addiu      %[h],       %[h],           -0x01               \n\t"
2086         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2087         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2088         "bnez       %[h],       1b                                  \n\t"
2089         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2090           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2091           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2092           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2093           [ftmp8]"=&f"(ftmp[8]),
2094           [tmp0]"=&r"(tmp[0]),
2095           RESTRICT_ASM_ALL64
2096           [src1]"=&r"(src1),
2097           [h]"+&r"(h),
2098           [dst]"+&r"(dst),                  [src]"+&r"(src)
2099         : [ff_pw_64]"f"(ff_pw_64),
2100           [srcstride]"r"((mips_reg)srcstride),
2101           [dststride]"r"((mips_reg)dststride),
2102           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2103           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2104         : "memory"
2105     );
2106 #else
2107     const uint8_t *filter = subpel_filters[my - 1];
2108     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2109     int x, y;
2110 
2111     for (y = 0; y < h; y++) {
2112         for (x = 0; x < 8; x++)
2113             dst[x] = FILTER_4TAP(src, filter, srcstride);
2114         dst += dststride;
2115         src += srcstride;
2116     }
2117 #endif
2118 }
2119 
2120 void ff_put_vp8_epel4_v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2121         ptrdiff_t srcstride, int h, int mx, int my)
2122 {
2123 #if 1
2124     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2125     double ftmp[6];
2126     uint32_t tmp[1];
2127     mips_reg src1;
2128     DECLARE_VAR_LOW32;
2129 
2130     /*
2131     dst[0] = cm[(filter[2] * src[0] - filter[1] * src[ -srcstride] + filter[3] * src[  srcstride] - filter[4] * src[  2*srcstride] + 64) >> 7];
2132     dst[1] = cm[(filter[2] * src[1] - filter[1] * src[1-srcstride] + filter[3] * src[1+srcstride] - filter[4] * src[1+2*srcstride] + 64) >> 7];
2133     dst[2] = cm[(filter[2] * src[2] - filter[1] * src[2-srcstride] + filter[3] * src[2+srcstride] - filter[4] * src[2+2*srcstride] + 64) >> 7];
2134     dst[3] = cm[(filter[2] * src[3] - filter[1] * src[3-srcstride] + filter[3] * src[3+srcstride] - filter[4] * src[3+2*srcstride] + 64) >> 7];
2135     */
2136     __asm__ volatile (
2137         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2138         "li         %[tmp0],    0x07                                \n\t"
2139         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2140 
2141         "1:                                                         \n\t"
2142         PUT_VP8_EPEL4_V4_MMI(%[src], %[src1], %[dst], %[srcstride])
2143 
2144         "addiu      %[h],       %[h],           -0x01               \n\t"
2145         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2146         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2147         "bnez       %[h],       1b                                  \n\t"
2148         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2149           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2150           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2151           [tmp0]"=&r"(tmp[0]),
2152           RESTRICT_ASM_LOW32
2153           [src1]"=&r"(src1),
2154           [h]"+&r"(h),
2155           [dst]"+&r"(dst),                  [src]"+&r"(src)
2156         : [ff_pw_64]"f"(ff_pw_64),
2157           [srcstride]"r"((mips_reg)srcstride),
2158           [dststride]"r"((mips_reg)dststride),
2159           [filter1]"f"(filter[1]),          [filter2]"f"(filter[2]),
2160           [filter3]"f"(filter[3]),          [filter4]"f"(filter[4])
2161         : "memory"
2162     );
2163 #else
2164     const uint8_t *filter = subpel_filters[my - 1];
2165     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2166     int x, y;
2167 
2168     for (y = 0; y < h; y++) {
2169         for (x = 0; x < 4; x++)
2170             dst[x] = FILTER_4TAP(src, filter, srcstride);
2171         dst += dststride;
2172         src += srcstride;
2173     }
2174 #endif
2175 }
2176 
2177 void ff_put_vp8_epel16_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2178         ptrdiff_t srcstride, int h, int mx, int my)
2179 {
2180 #if 1
2181     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2182     double ftmp[9];
2183     uint32_t tmp[1];
2184     mips_reg src0, src1, dst0;
2185     DECLARE_VAR_ALL64;
2186 
2187     /*
2188     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2189     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2190     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2191     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2192     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2193     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2194     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2195     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2196 
2197     dst[ 8] = cm[(filter[2]*src[ 8] - filter[1]*src[ 8-srcstride] + filter[0]*src[ 8-2*srcstride] + filter[3]*src[ 8+srcstride] - filter[4]*src[ 8+2*srcstride] + filter[5]*src[ 8+3*srcstride] + 64) >> 7];
2198     dst[ 9] = cm[(filter[2]*src[ 9] - filter[1]*src[ 9-srcstride] + filter[0]*src[ 9-2*srcstride] + filter[3]*src[ 9+srcstride] - filter[4]*src[ 9+2*srcstride] + filter[5]*src[ 9+3*srcstride] + 64) >> 7];
2199     dst[10] = cm[(filter[2]*src[10] - filter[1]*src[10-srcstride] + filter[0]*src[10-2*srcstride] + filter[3]*src[10+srcstride] - filter[4]*src[10+2*srcstride] + filter[5]*src[10+3*srcstride] + 64) >> 7];
2200     dst[11] = cm[(filter[2]*src[11] - filter[1]*src[11-srcstride] + filter[0]*src[11-2*srcstride] + filter[3]*src[11+srcstride] - filter[4]*src[11+2*srcstride] + filter[5]*src[11+3*srcstride] + 64) >> 7];
2201     dst[12] = cm[(filter[2]*src[12] - filter[1]*src[12-srcstride] + filter[0]*src[12-2*srcstride] + filter[3]*src[12+srcstride] - filter[4]*src[12+2*srcstride] + filter[5]*src[12+3*srcstride] + 64) >> 7];
2202     dst[13] = cm[(filter[2]*src[13] - filter[1]*src[13-srcstride] + filter[0]*src[13-2*srcstride] + filter[3]*src[13+srcstride] - filter[4]*src[13+2*srcstride] + filter[5]*src[13+3*srcstride] + 64) >> 7];
2203     dst[14] = cm[(filter[2]*src[14] - filter[1]*src[14-srcstride] + filter[0]*src[14-2*srcstride] + filter[3]*src[14+srcstride] - filter[4]*src[14+2*srcstride] + filter[5]*src[14+3*srcstride] + 64) >> 7];
2204     dst[15] = cm[(filter[2]*src[15] - filter[1]*src[15-srcstride] + filter[0]*src[15-2*srcstride] + filter[3]*src[15+srcstride] - filter[4]*src[15+2*srcstride] + filter[5]*src[15+3*srcstride] + 64) >> 7];
2205     */
2206     __asm__ volatile (
2207         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2208         "li         %[tmp0],    0x07                                \n\t"
2209         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2210 
2211         "1:                                                         \n\t"
2212         // 0 - 7
2213         PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2214         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2215         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2216         // 8 - 15
2217         PUT_VP8_EPEL8_V6_MMI(%[src0], %[src1], %[dst0], %[srcstride])
2218 
2219         "addiu      %[h],       %[h],           -0x01               \n\t"
2220         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2221         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2222         "bnez       %[h],       1b                                  \n\t"
2223         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2224           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2225           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2226           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2227           [ftmp8]"=&f"(ftmp[8]),
2228           [tmp0]"=&r"(tmp[0]),
2229           RESTRICT_ASM_ALL64
2230           [src0]"=&r"(src0),                [dst0]"=&r"(dst0),
2231           [src1]"=&r"(src1),
2232           [h]"+&r"(h),
2233           [dst]"+&r"(dst),                  [src]"+&r"(src)
2234         : [ff_pw_64]"f"(ff_pw_64),
2235           [srcstride]"r"((mips_reg)srcstride),
2236           [dststride]"r"((mips_reg)dststride),
2237           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2238           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2239           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2240         : "memory"
2241     );
2242 #else
2243     const uint8_t *filter = subpel_filters[my - 1];
2244     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2245     int x, y;
2246 
2247     for (y = 0; y < h; y++) {
2248         for (x = 0; x < 16; x++)
2249             dst[x] = FILTER_6TAP(src, filter, srcstride);
2250         dst += dststride;
2251         src += srcstride;
2252     }
2253 #endif
2254 }
2255 
2256 void ff_put_vp8_epel8_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2257         ptrdiff_t srcstride, int h, int mx, int my)
2258 {
2259 #if 1
2260     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2261     double ftmp[9];
2262     uint32_t tmp[1];
2263     mips_reg src1;
2264     DECLARE_VAR_ALL64;
2265 
2266     /*
2267     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2268     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2269     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2270     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2271     dst[4] = cm[(filter[2]*src[4] - filter[1]*src[4-srcstride] + filter[0]*src[4-2*srcstride] + filter[3]*src[4+srcstride] - filter[4]*src[4+2*srcstride] + filter[5]*src[4+3*srcstride] + 64) >> 7];
2272     dst[5] = cm[(filter[2]*src[5] - filter[1]*src[5-srcstride] + filter[0]*src[5-2*srcstride] + filter[3]*src[5+srcstride] - filter[4]*src[5+2*srcstride] + filter[5]*src[5+3*srcstride] + 64) >> 7];
2273     dst[6] = cm[(filter[2]*src[6] - filter[1]*src[6-srcstride] + filter[0]*src[6-2*srcstride] + filter[3]*src[6+srcstride] - filter[4]*src[6+2*srcstride] + filter[5]*src[6+3*srcstride] + 64) >> 7];
2274     dst[7] = cm[(filter[2]*src[7] - filter[1]*src[7-srcstride] + filter[0]*src[7-2*srcstride] + filter[3]*src[7+srcstride] - filter[4]*src[7+2*srcstride] + filter[5]*src[7+3*srcstride] + 64) >> 7];
2275     */
2276     __asm__ volatile (
2277         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2278         "li         %[tmp0],    0x07                                \n\t"
2279         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2280 
2281         "1:                                                         \n\t"
2282         PUT_VP8_EPEL8_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2283 
2284         "addiu      %[h],       %[h],           -0x01               \n\t"
2285         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2286         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2287         "bnez       %[h],       1b                                  \n\t"
2288         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2289           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2290           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2291           [ftmp6]"=&f"(ftmp[6]),            [ftmp7]"=&f"(ftmp[7]),
2292           [ftmp8]"=&f"(ftmp[8]),
2293           [tmp0]"=&r"(tmp[0]),
2294           RESTRICT_ASM_ALL64
2295           [src1]"=&r"(src1),
2296           [h]"+&r"(h),
2297           [dst]"+&r"(dst),                  [src]"+&r"(src)
2298         : [ff_pw_64]"f"(ff_pw_64),
2299           [srcstride]"r"((mips_reg)srcstride),
2300           [dststride]"r"((mips_reg)dststride),
2301           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2302           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2303           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2304         : "memory"
2305     );
2306 #else
2307     const uint8_t *filter = subpel_filters[my - 1];
2308     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2309     int x, y;
2310 
2311     for (y = 0; y < h; y++) {
2312         for (x = 0; x < 8; x++)
2313             dst[x] = FILTER_6TAP(src, filter, srcstride);
2314         dst += dststride;
2315         src += srcstride;
2316     }
2317 #endif
2318 }
2319 
2320 void ff_put_vp8_epel4_v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2321         ptrdiff_t srcstride, int h, int mx, int my)
2322 {
2323 #if 1
2324     const uint64_t *filter = fourtap_subpel_filters[my - 1];
2325     double ftmp[6];
2326     uint32_t tmp[1];
2327     mips_reg src1;
2328     DECLARE_VAR_LOW32;
2329 
2330     /*
2331     dst[0] = cm[(filter[2]*src[0] - filter[1]*src[0-srcstride] + filter[0]*src[0-2*srcstride] + filter[3]*src[0+srcstride] - filter[4]*src[0+2*srcstride] + filter[5]*src[0+3*srcstride] + 64) >> 7];
2332     dst[1] = cm[(filter[2]*src[1] - filter[1]*src[1-srcstride] + filter[0]*src[1-2*srcstride] + filter[3]*src[1+srcstride] - filter[4]*src[1+2*srcstride] + filter[5]*src[1+3*srcstride] + 64) >> 7];
2333     dst[2] = cm[(filter[2]*src[2] - filter[1]*src[2-srcstride] + filter[0]*src[2-2*srcstride] + filter[3]*src[2+srcstride] - filter[4]*src[2+2*srcstride] + filter[5]*src[2+3*srcstride] + 64) >> 7];
2334     dst[3] = cm[(filter[2]*src[3] - filter[1]*src[3-srcstride] + filter[0]*src[3-2*srcstride] + filter[3]*src[3+srcstride] - filter[4]*src[3+2*srcstride] + filter[5]*src[3+3*srcstride] + 64) >> 7];
2335     */
2336     __asm__ volatile (
2337         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2338         "li         %[tmp0],    0x07                                \n\t"
2339         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2340 
2341         "1:                                                         \n\t"
2342         PUT_VP8_EPEL4_V6_MMI(%[src], %[src1], %[dst], %[srcstride])
2343 
2344         "addiu      %[h],       %[h],           -0x01               \n\t"
2345         PTR_ADDU   "%[src],     %[src],         %[srcstride]        \n\t"
2346         PTR_ADDU   "%[dst],     %[dst],         %[dststride]        \n\t"
2347         "bnez       %[h],       1b                                  \n\t"
2348         : [ftmp0]"=&f"(ftmp[0]),            [ftmp1]"=&f"(ftmp[1]),
2349           [ftmp2]"=&f"(ftmp[2]),            [ftmp3]"=&f"(ftmp[3]),
2350           [ftmp4]"=&f"(ftmp[4]),            [ftmp5]"=&f"(ftmp[5]),
2351           [tmp0]"=&r"(tmp[0]),
2352           RESTRICT_ASM_LOW32
2353           [src1]"=&r"(src1),
2354           [h]"+&r"(h),
2355           [dst]"+&r"(dst),                  [src]"+&r"(src)
2356         : [ff_pw_64]"f"(ff_pw_64),
2357           [srcstride]"r"((mips_reg)srcstride),
2358           [dststride]"r"((mips_reg)dststride),
2359           [filter0]"f"(filter[0]),          [filter1]"f"(filter[1]),
2360           [filter2]"f"(filter[2]),          [filter3]"f"(filter[3]),
2361           [filter4]"f"(filter[4]),          [filter5]"f"(filter[5])
2362         : "memory"
2363     );
2364 #else
2365     const uint8_t *filter = subpel_filters[my - 1];
2366     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2367     int x, y;
2368 
2369     for (y = 0; y < h; y++) {
2370         for (x = 0; x < 4; x++)
2371             dst[x] = FILTER_6TAP(src, filter, srcstride);
2372         dst += dststride;
2373         src += srcstride;
2374     }
2375 #endif
2376 }
2377 
2378 void ff_put_vp8_epel16_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2379         ptrdiff_t srcstride, int h, int mx, int my)
2380 {
2381 #if 1
2382     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2383     uint8_t *tmp = tmp_array;
2384 
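    /* Two-pass 4-tap filter: the horizontal pass writes h + 3 rows into
     * tmp_array (the vertical taps read one row above and two below the
     * current row), then the vertical pass starts one 16-byte row in, at
     * tmp_array + 16. 560 = (2 * 16 + 3) * 16 bytes of headroom. */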
2385     src -= srcstride;
2386     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2387     tmp    = tmp_array + 16;
2388     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2389 #else
2390     const uint8_t *filter = subpel_filters[mx - 1];
2391     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2392     int x, y;
2393     uint8_t tmp_array[560];
2394     uint8_t *tmp = tmp_array;
2395 
2396     src -= srcstride;
2397 
2398     for (y = 0; y < h + 3; y++) {
2399         for (x = 0; x < 16; x++)
2400             tmp[x] = FILTER_4TAP(src, filter, 1);
2401         tmp += 16;
2402         src += srcstride;
2403     }
2404 
2405     tmp    = tmp_array + 16;
2406     filter = subpel_filters[my - 1];
2407 
2408     for (y = 0; y < h; y++) {
2409         for (x = 0; x < 16; x++)
2410             dst[x] = FILTER_4TAP(tmp, filter, 16);
2411         dst += dststride;
2412         tmp += 16;
2413     }
2414 #endif
2415 }
2416 
2417 void ff_put_vp8_epel8_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2418         ptrdiff_t srcstride, int h, int mx, int my)
2419 {
2420 #if 1
2421     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2422     uint8_t *tmp = tmp_array;
2423 
2424     src -= srcstride;
2425     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2426     tmp    = tmp_array + 8;
2427     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2428 #else
2429     const uint8_t *filter = subpel_filters[mx - 1];
2430     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2431     int x, y;
2432     uint8_t tmp_array[152];
2433     uint8_t *tmp = tmp_array;
2434 
2435     src -= srcstride;
2436 
2437     for (y = 0; y < h + 3; y++) {
2438         for (x = 0; x < 8; x++)
2439             tmp[x] = FILTER_4TAP(src, filter, 1);
2440         tmp += 8;
2441         src += srcstride;
2442     }
2443 
2444     tmp    = tmp_array + 8;
2445     filter = subpel_filters[my - 1];
2446 
2447     for (y = 0; y < h; y++) {
2448         for (x = 0; x < 8; x++)
2449             dst[x] = FILTER_4TAP(tmp, filter, 8);
2450         dst += dststride;
2451         tmp += 8;
2452     }
2453 #endif
2454 }
2455 
2456 void ff_put_vp8_epel4_h4v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2457         ptrdiff_t srcstride, int h, int mx, int my)
2458 {
2459 #if 1
2460     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2461     uint8_t *tmp = tmp_array;
2462 
2463     src -= srcstride;
2464     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2465     tmp    = tmp_array + 4;
2466     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2467 #else
2468     const uint8_t *filter = subpel_filters[mx - 1];
2469     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2470     int x, y;
2471     uint8_t tmp_array[44];
2472     uint8_t *tmp = tmp_array;
2473 
2474     src -= srcstride;
2475 
2476     for (y = 0; y < h + 3; y++) {
2477         for (x = 0; x < 4; x++)
2478             tmp[x] = FILTER_4TAP(src, filter, 1);
2479         tmp += 4;
2480         src += srcstride;
2481     }
2482     tmp    = tmp_array + 4;
2483     filter = subpel_filters[my - 1];
2484 
2485     for (y = 0; y < h; y++) {
2486         for (x = 0; x < 4; x++)
2487             dst[x] = FILTER_4TAP(tmp, filter, 4);
2488         dst += dststride;
2489         tmp += 4;
2490     }
2491 #endif
2492 }
2493 
2494 void ff_put_vp8_epel16_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2495         ptrdiff_t srcstride, int h, int mx, int my)
2496 {
2497 #if 1
2498     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2499     uint8_t *tmp = tmp_array;
2500 
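    /* Mixed taps: 4-tap horizontal, 6-tap vertical. The vertical filter
     * reads two rows above and three below, so h + 5 rows are prefiltered
     * and the vertical pass starts at tmp_array + 32 (two 16-byte rows in).
     * 592 = (2 * 16 + 5) * 16. */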
2501     src -= 2 * srcstride;
2502     ff_put_vp8_epel16_h4_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2503     tmp    = tmp_array + 32;
2504     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2505 #else
2506     const uint8_t *filter = subpel_filters[mx - 1];
2507     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2508     int x, y;
2509     uint8_t tmp_array[592];
2510     uint8_t *tmp = tmp_array;
2511 
2512     src -= 2 * srcstride;
2513 
2514     for (y = 0; y < h + 5; y++) {
2515         for (x = 0; x < 16; x++)
2516             tmp[x] = FILTER_4TAP(src, filter, 1);
2517         tmp += 16;
2518         src += srcstride;
2519     }
2520 
2521     tmp    = tmp_array + 32;
2522     filter = subpel_filters[my - 1];
2523 
2524     for (y = 0; y < h; y++) {
2525         for (x = 0; x < 16; x++)
2526             dst[x] = FILTER_6TAP(tmp, filter, 16);
2527         dst += dststride;
2528         tmp += 16;
2529     }
2530 #endif
2531 }
2532 
2533 void ff_put_vp8_epel8_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2534         ptrdiff_t srcstride, int h, int mx, int my)
2535 {
2536 #if 1
2537     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2538     uint8_t *tmp = tmp_array;
2539 
2540     src -= 2 * srcstride;
2541     ff_put_vp8_epel8_h4_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2542     tmp    = tmp_array + 16;
2543     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2544 #else
2545     const uint8_t *filter = subpel_filters[mx - 1];
2546     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2547     int x, y;
2548     uint8_t tmp_array[168];
2549     uint8_t *tmp = tmp_array;
2550 
2551     src -= 2 * srcstride;
2552 
2553     for (y = 0; y < h + 5; y++) {
2554         for (x = 0; x < 8; x++)
2555             tmp[x] = FILTER_4TAP(src, filter, 1);
2556         tmp += 8;
2557         src += srcstride;
2558     }
2559 
2560     tmp    = tmp_array + 16;
2561     filter = subpel_filters[my - 1];
2562 
2563     for (y = 0; y < h; y++) {
2564         for (x = 0; x < 8; x++)
2565             dst[x] = FILTER_6TAP(tmp, filter, 8);
2566         dst += dststride;
2567         tmp += 8;
2568     }
2569 #endif
2570 }
2571 
2572 void ff_put_vp8_epel4_h4v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2573         ptrdiff_t srcstride, int h, int mx, int my)
2574 {
2575 #if 1
2576     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2577     uint8_t *tmp = tmp_array;
2578 
2579     src -= 2 * srcstride;
2580     ff_put_vp8_epel4_h4_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2581     tmp    = tmp_array + 8;
2582     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2583 #else
2584     const uint8_t *filter = subpel_filters[mx - 1];
2585     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2586     int x, y;
2587     uint8_t tmp_array[52];
2588     uint8_t *tmp = tmp_array;
2589 
2590     src -= 2 * srcstride;
2591 
2592     for (y = 0; y < h + 5; y++) {
2593         for (x = 0; x < 4; x++)
2594             tmp[x] = FILTER_4TAP(src, filter, 1);
2595         tmp += 4;
2596         src += srcstride;
2597     }
2598 
2599     tmp    = tmp_array + 8;
2600     filter = subpel_filters[my - 1];
2601 
2602     for (y = 0; y < h; y++) {
2603         for (x = 0; x < 4; x++)
2604             dst[x] = FILTER_6TAP(tmp, filter, 4);
2605         dst += dststride;
2606         tmp += 4;
2607     }
2608 #endif
2609 }
2610 
2611 void ff_put_vp8_epel16_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2612         ptrdiff_t srcstride, int h, int mx, int my)
2613 {
2614 #if 1
2615     DECLARE_ALIGNED(8, uint8_t, tmp_array[560]);
2616     uint8_t *tmp = tmp_array;
2617 
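    /* 6-tap horizontal, 4-tap vertical: h + 3 prefiltered rows (one above,
     * two below for the vertical taps), vertical pass from tmp_array + 16. */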
2618     src -= srcstride;
2619     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 3, mx, my);
2620     tmp    = tmp_array + 16;
2621     ff_put_vp8_epel16_v4_mmi(dst, dststride, tmp, 16, h, mx, my);
2622 #else
2623     const uint8_t *filter = subpel_filters[mx - 1];
2624     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2625     int x, y;
2626     uint8_t tmp_array[560];
2627     uint8_t *tmp = tmp_array;
2628 
2629     src -= srcstride;
2630 
2631     for (y = 0; y < h + 3; y++) {
2632         for (x = 0; x < 16; x++)
2633             tmp[x] = FILTER_6TAP(src, filter, 1);
2634         tmp += 16;
2635         src += srcstride;
2636     }
2637 
2638     tmp    = tmp_array + 16;
2639     filter = subpel_filters[my - 1];
2640 
2641     for (y = 0; y < h; y++) {
2642         for (x = 0; x < 16; x++)
2643             dst[x] = FILTER_4TAP(tmp, filter, 16);
2644         dst += dststride;
2645         tmp += 16;
2646     }
2647 #endif
2648 }
2649 
2650 void ff_put_vp8_epel8_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2651         ptrdiff_t srcstride, int h, int mx, int my)
2652 {
2653 #if 1
2654     DECLARE_ALIGNED(8, uint8_t, tmp_array[152]);
2655     uint8_t *tmp = tmp_array;
2656 
2657     src -= srcstride;
2658     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 3, mx, my);
2659     tmp    = tmp_array + 8;
2660     ff_put_vp8_epel8_v4_mmi(dst, dststride, tmp, 8, h, mx, my);
2661 #else
2662     const uint8_t *filter = subpel_filters[mx - 1];
2663     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2664     int x, y;
2665     uint8_t tmp_array[152];
2666     uint8_t *tmp = tmp_array;
2667 
2668     src -= srcstride;
2669 
2670     for (y = 0; y < h + 3; y++) {
2671         for (x = 0; x < 8; x++)
2672             tmp[x] = FILTER_6TAP(src, filter, 1);
2673         tmp += 8;
2674         src += srcstride;
2675     }
2676 
2677     tmp    = tmp_array + 8;
2678     filter = subpel_filters[my - 1];
2679 
2680     for (y = 0; y < h; y++) {
2681         for (x = 0; x < 8; x++)
2682             dst[x] = FILTER_4TAP(tmp, filter, 8);
2683         dst += dststride;
2684         tmp += 8;
2685     }
2686 #endif
2687 }
2688 
2689 void ff_put_vp8_epel4_h6v4_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2690         ptrdiff_t srcstride, int h, int mx, int my)
2691 {
2692 #if 1
2693     DECLARE_ALIGNED(4, uint8_t, tmp_array[44]);
2694     uint8_t *tmp = tmp_array;
2695 
2696     src -= srcstride;
2697     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 3, mx, my);
2698     tmp    = tmp_array + 4;
2699     ff_put_vp8_epel4_v4_mmi(dst, dststride, tmp, 4, h, mx, my);
2700 #else
2701     const uint8_t *filter = subpel_filters[mx - 1];
2702     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2703     int x, y;
2704     uint8_t tmp_array[44];
2705     uint8_t *tmp = tmp_array;
2706 
2707     src -= srcstride;
2708 
2709     for (y = 0; y < h + 3; y++) {
2710         for (x = 0; x < 4; x++)
2711             tmp[x] = FILTER_6TAP(src, filter, 1);
2712         tmp += 4;
2713         src += srcstride;
2714     }
2715 
2716     tmp    = tmp_array + 4;
2717     filter = subpel_filters[my - 1];
2718 
2719     for (y = 0; y < h; y++) {
2720         for (x = 0; x < 4; x++)
2721             dst[x] = FILTER_4TAP(tmp, filter, 4);
2722         dst += dststride;
2723         tmp += 4;
2724     }
2725 #endif
2726 }
2727 
2728 void ff_put_vp8_epel16_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2729         ptrdiff_t srcstride, int h, int mx, int my)
2730 {
2731 #if 1
2732     DECLARE_ALIGNED(8, uint8_t, tmp_array[592]);
2733     uint8_t *tmp = tmp_array;
2734 
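    /* 6-tap in both directions: h + 5 prefiltered rows (two above, three
     * below), vertical pass from tmp_array + 32. 592 = (2 * 16 + 5) * 16. */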
2735     src -= 2 * srcstride;
2736     ff_put_vp8_epel16_h6_mmi(tmp, 16, src, srcstride, h + 5, mx, my);
2737     tmp    = tmp_array + 32;
2738     ff_put_vp8_epel16_v6_mmi(dst, dststride, tmp, 16, h, mx, my);
2739 #else
2740     const uint8_t *filter = subpel_filters[mx - 1];
2741     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2742     int x, y;
2743     uint8_t tmp_array[592];
2744     uint8_t *tmp = tmp_array;
2745 
2746     src -= 2 * srcstride;
2747 
2748     for (y = 0; y < h + 5; y++) {
2749         for (x = 0; x < 16; x++)
2750             tmp[x] = FILTER_6TAP(src, filter, 1);
2751         tmp += 16;
2752         src += srcstride;
2753     }
2754 
2755     tmp    = tmp_array + 32;
2756     filter = subpel_filters[my - 1];
2757 
2758     for (y = 0; y < h; y++) {
2759         for (x = 0; x < 16; x++)
2760             dst[x] = FILTER_6TAP(tmp, filter, 16);
2761         dst += dststride;
2762         tmp += 16;
2763     }
2764 #endif
2765 }
2766 
2767 void ff_put_vp8_epel8_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2768         ptrdiff_t srcstride, int h, int mx, int my)
2769 {
2770 #if 1
2771     DECLARE_ALIGNED(8, uint8_t, tmp_array[168]);
2772     uint8_t *tmp = tmp_array;
2773 
2774     src -= 2 * srcstride;
2775     ff_put_vp8_epel8_h6_mmi(tmp, 8, src, srcstride, h + 5, mx, my);
2776     tmp    = tmp_array + 16;
2777     ff_put_vp8_epel8_v6_mmi(dst, dststride, tmp, 8, h, mx, my);
2778 #else
2779     const uint8_t *filter = subpel_filters[mx - 1];
2780     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2781     int x, y;
2782     uint8_t tmp_array[168];
2783     uint8_t *tmp = tmp_array;
2784 
2785     src -= 2 * srcstride;
2786 
2787     for (y = 0; y < h + 5; y++) {
2788         for (x = 0; x < 8; x++)
2789             tmp[x] = FILTER_6TAP(src, filter, 1);
2790         tmp += 8;
2791         src += srcstride;
2792     }
2793 
2794     tmp    = tmp_array + 16;
2795     filter = subpel_filters[my - 1];
2796 
2797     for (y = 0; y < h; y++) {
2798         for (x = 0; x < 8; x++)
2799             dst[x] = FILTER_6TAP(tmp, filter, 8);
2800         dst += dststride;
2801         tmp += 8;
2802     }
2803 #endif
2804 }
2805 
2806 void ff_put_vp8_epel4_h6v6_mmi(uint8_t *dst, ptrdiff_t dststride, uint8_t *src,
2807         ptrdiff_t srcstride, int h, int mx, int my)
2808 {
2809 #if 1
2810     DECLARE_ALIGNED(4, uint8_t, tmp_array[52]);
2811     uint8_t *tmp = tmp_array;
2812 
2813     src -= 2 * srcstride;
2814     ff_put_vp8_epel4_h6_mmi(tmp, 4, src, srcstride, h + 5, mx, my);
2815     tmp    = tmp_array + 8;
2816     ff_put_vp8_epel4_v6_mmi(dst, dststride, tmp, 4, h, mx, my);
2817 #else
2818     const uint8_t *filter = subpel_filters[mx - 1];
2819     const uint8_t *cm     = ff_crop_tab + MAX_NEG_CROP;
2820     int x, y;
2821     uint8_t tmp_array[52];
2822     uint8_t *tmp = tmp_array;
2823 
2824     src -= 2 * srcstride;
2825 
2826     for (y = 0; y < h + 5; y++) {
2827         for (x = 0; x < 4; x++)
2828             tmp[x] = FILTER_6TAP(src, filter, 1);
2829         tmp += 4;
2830         src += srcstride;
2831     }
2832 
2833     tmp    = tmp_array + 8;
2834     filter = subpel_filters[my - 1];
2835 
2836     for (y = 0; y < h; y++) {
2837         for (x = 0; x < 4; x++)
2838             dst[x] = FILTER_6TAP(tmp, filter, 4);
2839         dst += dststride;
2840         tmp += 4;
2841     }
2842 #endif
2843 }
2844 
2845 void ff_put_vp8_bilinear16_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2846         ptrdiff_t sstride, int h, int mx, int my)
2847 {
2848 #if 1
2849     int a = 8 - mx, b = mx;
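    /* Eighth-pel bilinear weights: a + b == 8, so each output is the
     * weighted average (a * src[x] + b * src[x + 1] + 4) >> 3,
     * rounded to nearest. */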
2850     double ftmp[7];
2851     uint32_t tmp[1];
2852     mips_reg dst0, src0;
2853     DECLARE_VAR_ALL64;
2854 
2855     /*
2856     dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
2857     dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
2858     dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
2859     dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
2860     dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
2861     dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
2862     dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
2863     dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
2864 
2865     dst[ 8] = (a * src[ 8] + b * src[ 9] + 4) >> 3;
2866     dst[ 9] = (a * src[ 9] + b * src[10] + 4) >> 3;
2867     dst[10] = (a * src[10] + b * src[11] + 4) >> 3;
2868     dst[11] = (a * src[11] + b * src[12] + 4) >> 3;
2869     dst[12] = (a * src[12] + b * src[13] + 4) >> 3;
2870     dst[13] = (a * src[13] + b * src[14] + 4) >> 3;
2871     dst[14] = (a * src[14] + b * src[15] + 4) >> 3;
2872     dst[15] = (a * src[15] + b * src[16] + 4) >> 3;
2873     */
2874     __asm__ volatile (
2875         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2876         "li         %[tmp0],    0x03                                \n\t"
2877         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2878         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
2879         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
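        // pshufh with the zeroed ftmp0 as shuffle pattern replicates the low
        // halfword, broadcasting the scalar weights a and b to all four
        // halfwords for the per-halfword arithmetic in the macro below.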
2880 
2881         "1:                                                         \n\t"
2882         // 0 - 7
2883         PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
2884         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2885         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2886         // 8 - 15
2887         PUT_VP8_BILINEAR8_H_MMI(%[src0], %[dst0])
2888 
2889         "addiu      %[h],       %[h],           -0x01               \n\t"
2890         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
2891         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
2892         "bnez       %[h],       1b                                  \n\t"
2893         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2894           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2895           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2896           [ftmp6]"=&f"(ftmp[6]),
2897           [tmp0]"=&r"(tmp[0]),
2898           RESTRICT_ASM_ALL64
2899           [dst0]"=&r"(dst0),            [src0]"=&r"(src0),
2900           [h]"+&r"(h),
2901           [dst]"+&r"(dst),              [src]"+&r"(src),
2902           [a]"+&f"(a),                  [b]"+&f"(b)
2903         : [sstride]"r"((mips_reg)sstride),
2904           [dstride]"r"((mips_reg)dstride),
2905           [ff_pw_4]"f"(ff_pw_4)
2906         : "memory"
2907     );
2908 #else
2909     int a = 8 - mx, b = mx;
2910     int x, y;
2911 
2912     for (y = 0; y < h; y++) {
2913         for (x = 0; x < 16; x++)
2914             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
2915         dst += dstride;
2916         src += sstride;
2917     }
2918 #endif
2919 }
2920 
2921 void ff_put_vp8_bilinear16_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2922         ptrdiff_t sstride, int h, int mx, int my)
2923 {
2924 #if 1
2925     int c = 8 - my, d = my;
2926     double ftmp[7];
2927     uint32_t tmp[1];
2928     mips_reg src0, src1, dst0;
2929     DECLARE_VAR_ALL64;
2930 
2931     /*
2932     dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
2933     dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
2934     dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
2935     dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
2936     dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
2937     dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
2938     dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
2939     dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
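    ... and likewise for dst[8] through dst[15], handled as a second
    8-pixel pass below.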
2940     */
2941     __asm__ volatile (
2942         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
2943         "li         %[tmp0],    0x03                                \n\t"
2944         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
2945         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
2946         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
2947 
2948         "1:                                                         \n\t"
2949         // 0 - 7
2950         PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
2951         PTR_ADDIU  "%[src0],    %[src],         0x08                \n\t"
2952         PTR_ADDIU  "%[dst0],    %[dst],         0x08                \n\t"
2953         // 8 - 15
2954         PUT_VP8_BILINEAR8_V_MMI(%[src0], %[src1], %[dst0], %[sstride])
2955 
2956         "addiu      %[h],       %[h],           -0x01               \n\t"
2957         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
2958         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
2959         "bnez       %[h],       1b                                  \n\t"
2960         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
2961           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
2962           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
2963           [ftmp6]"=&f"(ftmp[6]),
2964           [tmp0]"=&r"(tmp[0]),
2965           RESTRICT_ASM_ALL64
2966           [src0]"=&r"(src0),            [dst0]"=&r"(dst0),
2967           [src1]"=&r"(src1),
2968           [h]"+&r"(h),
2969           [dst]"+&r"(dst),              [src]"+&r"(src),
2970           [c]"+&f"(c),                  [d]"+&f"(d)
2971         : [sstride]"r"((mips_reg)sstride),
2972           [dstride]"r"((mips_reg)dstride),
2973           [ff_pw_4]"f"(ff_pw_4)
2974         : "memory"
2975     );
2976 #else
2977     int c = 8 - my, d = my;
2978     int x, y;
2979 
2980     for (y = 0; y < h; y++) {
2981         for (x = 0; x < 16; x++)
2982             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
2983         dst += dstride;
2984         src += sstride;
2985     }
2986 #endif
2987 }
2988 
2989 void ff_put_vp8_bilinear16_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
2990         ptrdiff_t sstride, int h, int mx, int my)
2991 {
2992 #if 1
2993     DECLARE_ALIGNED(8, uint8_t, tmp_array[528]);
2994     uint8_t *tmp = tmp_array;
2995 
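    /* Horizontal bilinear pass over h + 1 rows (the vertical pass reads one
     * row below the current one), then vertical pass from the same buffer.
     * 528 = (2 * 16 + 1) * 16. */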
2996     ff_put_vp8_bilinear16_h_mmi(tmp, 16, src, sstride, h + 1, mx, my);
2997     ff_put_vp8_bilinear16_v_mmi(dst, dstride, tmp, 16, h, mx, my);
2998 #else
2999     int a = 8 - mx, b = mx;
3000     int c = 8 - my, d = my;
3001     int x, y;
3002     uint8_t tmp_array[528];
3003     uint8_t *tmp = tmp_array;
3004 
3005     for (y = 0; y < h + 1; y++) {
3006         for (x = 0; x < 16; x++)
3007             tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3008         tmp += 16;
3009         src += sstride;
3010     }
3011 
3012     tmp = tmp_array;
3013 
3014     for (y = 0; y < h; y++) {
3015         for (x = 0; x < 16; x++)
3016             dst[x] = (c * tmp[x] + d * tmp[x + 16] + 4) >> 3;
3017         dst += dstride;
3018         tmp += 16;
3019     }
3020 #endif
3021 }
3022 
3023 void ff_put_vp8_bilinear8_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3024         ptrdiff_t sstride, int h, int mx, int my)
3025 {
3026 #if 1
3027     int a = 8 - mx, b = mx;
3028     double ftmp[7];
3029     uint32_t tmp[1];
3030     DECLARE_VAR_ALL64;
3031 
3032     /*
3033     dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
3034     dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
3035     dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
3036     dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
3037     dst[4] = (a * src[4] + b * src[5] + 4) >> 3;
3038     dst[5] = (a * src[5] + b * src[6] + 4) >> 3;
3039     dst[6] = (a * src[6] + b * src[7] + 4) >> 3;
3040     dst[7] = (a * src[7] + b * src[8] + 4) >> 3;
3041     */
3042     __asm__ volatile (
3043         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3044         "li         %[tmp0],    0x03                                \n\t"
3045         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3046         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3047         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3048 
3049         "1:                                                         \n\t"
3050         PUT_VP8_BILINEAR8_H_MMI(%[src], %[dst])
3051 
3052         "addiu      %[h],       %[h],           -0x01               \n\t"
3053         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3054         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3055         "bnez       %[h],       1b                                  \n\t"
3056         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3057           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3058           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3059           [ftmp6]"=&f"(ftmp[6]),
3060           [tmp0]"=&r"(tmp[0]),
3061           RESTRICT_ASM_ALL64
3062           [h]"+&r"(h),
3063           [dst]"+&r"(dst),              [src]"+&r"(src),
3064           [a]"+&f"(a),                  [b]"+&f"(b)
3065         : [sstride]"r"((mips_reg)sstride),
3066           [dstride]"r"((mips_reg)dstride),
3067           [ff_pw_4]"f"(ff_pw_4)
3068         : "memory"
3069     );
3070 #else
3071     int a = 8 - mx, b = mx;
3072     int x, y;
3073 
3074     for (y = 0; y < h; y++) {
3075         for (x = 0; x < 8; x++)
3076             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3077         dst += dstride;
3078         src += sstride;
3079     }
3080 #endif
3081 }
3082 
3083 void ff_put_vp8_bilinear8_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
3084         ptrdiff_t sstride, int h, int mx, int my)
3085 {
3086 #if 1
3087     int c = 8 - my, d = my;
3088     double ftmp[7];
3089     uint32_t tmp[1];
3090     mips_reg src1;
3091     DECLARE_VAR_ALL64;
3092 
3093     /*
3094     dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
3095     dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
3096     dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
3097     dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
3098     dst[4] = (c * src[4] + d * src[4 + sstride] + 4) >> 3;
3099     dst[5] = (c * src[5] + d * src[5 + sstride] + 4) >> 3;
3100     dst[6] = (c * src[6] + d * src[6 + sstride] + 4) >> 3;
3101     dst[7] = (c * src[7] + d * src[7 + sstride] + 4) >> 3;
3102     */
3103     __asm__ volatile (
3104         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3105         "li         %[tmp0],    0x03                                \n\t"
3106         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3107         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3108         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3109 
3110         "1:                                                         \n\t"
3111         PUT_VP8_BILINEAR8_V_MMI(%[src], %[src1], %[dst], %[sstride])
3112 
3113         "addiu      %[h],       %[h],           -0x01               \n\t"
3114         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3115         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3116         "bnez       %[h],       1b                                  \n\t"
3117         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3118           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3119           [ftmp4]"=&f"(ftmp[4]),        [ftmp5]"=&f"(ftmp[5]),
3120           [ftmp6]"=&f"(ftmp[6]),
3121           [tmp0]"=&r"(tmp[0]),
3122           RESTRICT_ASM_ALL64
3123           [src1]"=&r"(src1),
3124           [h]"+&r"(h),
3125           [dst]"+&r"(dst),              [src]"+&r"(src),
3126           [c]"+&f"(c),                  [d]"+&f"(d)
3127         : [sstride]"r"((mips_reg)sstride),
3128           [dstride]"r"((mips_reg)dstride),
3129           [ff_pw_4]"f"(ff_pw_4)
3130         : "memory"
3131     );
3132 #else
3133     int c = 8 - my, d = my;
3134     int x, y;
3135 
3136     for (y = 0; y < h; y++) {
3137         for (x = 0; x < 8; x++)
3138             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3139         dst += dstride;
3140         src += sstride;
3141     }
3142 #endif
3143 }
3144 
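/* 8-pixel bilinear filter with both fractional offsets, done as two
 * separable passes: the horizontal pass writes h + 1 rows into an
 * 8-byte-strided temporary buffer (the extra row feeds the vertical taps of
 * the last output row), then the vertical pass filters from there into dst. */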
void ff_put_vp8_bilinear8_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(8, uint8_t, tmp_array[136]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear8_h_mmi(tmp, 8, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear8_v_mmi(dst, dstride, tmp, 8, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[136];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 8; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 8;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 8] + 4) >> 3;
        dst += dstride;
        tmp += 8;
    }
#endif
}

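/* Horizontal 4-pixel bilinear filter: dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3,
 * using fixed-point weights a = 8 - mx and b = mx (in eighths, rounded by 4). */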
void ff_put_vp8_bilinear4_h_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int a = 8 - mx, b = mx;
    double ftmp[5];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (a * src[0] + b * src[1] + 4) >> 3;
    dst[1] = (a * src[1] + b * src[2] + 4) >> 3;
    dst[2] = (a * src[2] + b * src[3] + 4) >> 3;
    dst[3] = (a * src[3] + b * src[4] + 4) >> 3;
    */
    __asm__ volatile (
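        /* Setup: load the shift count 3 into ftmp4 for the final >> 3, then
         * broadcast the 16-bit weights a and b across all four halfwords of
         * their registers (pshufh with a zero selector replicates element 0). */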
3196         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3197         "li         %[tmp0],    0x03                                \n\t"
3198         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3199         "pshufh     %[a],       %[a],           %[ftmp0]            \n\t"
3200         "pshufh     %[b],       %[b],           %[ftmp0]            \n\t"
3201 
3202         "1:                                                         \n\t"
3203         PUT_VP8_BILINEAR4_H_MMI(%[src], %[dst])
3204 
3205         "addiu      %[h],       %[h],           -0x01               \n\t"
3206         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3207         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3208         "bnez       %[h],       1b                                  \n\t"
3209         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3210           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3211           [ftmp4]"=&f"(ftmp[4]),
3212           [tmp0]"=&r"(tmp[0]),
3213           RESTRICT_ASM_LOW32
3214           RESTRICT_ASM_ALL64
3215           [h]"+&r"(h),
3216           [dst]"+&r"(dst),              [src]"+&r"(src),
3217           [a]"+&f"(a),                  [b]"+&f"(b)
3218         : [sstride]"r"((mips_reg)sstride),
3219           [dstride]"r"((mips_reg)dstride),
3220           [ff_pw_4]"f"(ff_pw_4)
3221         : "memory"
3222     );
3223 #else
3224     int a = 8 - mx, b = mx;
3225     int x, y;
3226 
3227     for (y = 0; y < h; y++) {
3228         for (x = 0; x < 4; x++)
3229             dst[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
3230         dst += dstride;
3231         src += sstride;
3232     }
3233 #endif
3234 }
3235 
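/* Vertical 4-pixel bilinear filter: dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3,
 * using fixed-point weights c = 8 - my and d = my (in eighths, rounded by 4). */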
void ff_put_vp8_bilinear4_v_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    int c = 8 - my, d = my;
    double ftmp[7];
    uint32_t tmp[1];
    mips_reg src1;
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ALL64;

    /*
    dst[0] = (c * src[0] + d * src[    sstride] + 4) >> 3;
    dst[1] = (c * src[1] + d * src[1 + sstride] + 4) >> 3;
    dst[2] = (c * src[2] + d * src[2 + sstride] + 4) >> 3;
    dst[3] = (c * src[3] + d * src[3 + sstride] + 4) >> 3;
    */
    __asm__ volatile (
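        /* Setup: as in the 8-pixel case, ftmp4 gets the shift count 3 and
         * the 16-bit weights c and d are broadcast across all four halfwords
         * of their registers. */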
3254         "xor        %[ftmp0],   %[ftmp0],       %[ftmp0]            \n\t"
3255         "li         %[tmp0],    0x03                                \n\t"
3256         "mtc1       %[tmp0],    %[ftmp4]                            \n\t"
3257         "pshufh     %[c],       %[c],           %[ftmp0]            \n\t"
3258         "pshufh     %[d],       %[d],           %[ftmp0]            \n\t"
3259 
3260         "1:                                                         \n\t"
3261         PUT_VP8_BILINEAR4_V_MMI(%[src], %[src1], %[dst], %[sstride])
3262 
3263         "addiu      %[h],       %[h],           -0x01               \n\t"
3264         PTR_ADDU   "%[src],     %[src],         %[sstride]          \n\t"
3265         PTR_ADDU   "%[dst],     %[dst],         %[dstride]          \n\t"
3266         "bnez       %[h],       1b                                  \n\t"
3267         : [ftmp0]"=&f"(ftmp[0]),        [ftmp1]"=&f"(ftmp[1]),
3268           [ftmp2]"=&f"(ftmp[2]),        [ftmp3]"=&f"(ftmp[3]),
3269           [ftmp4]"=&f"(ftmp[4]),
3270           [tmp0]"=&r"(tmp[0]),
3271           RESTRICT_ASM_LOW32
3272           RESTRICT_ASM_ALL64
3273           [src1]"=&r"(src1),
3274           [h]"+&r"(h),
3275           [dst]"+&r"(dst),              [src]"+&r"(src),
3276           [c]"+&f"(c),                  [d]"+&f"(d)
3277         : [sstride]"r"((mips_reg)sstride),
3278           [dstride]"r"((mips_reg)dstride),
3279           [ff_pw_4]"f"(ff_pw_4)
3280         : "memory"
3281     );
3282 #else
3283     int c = 8 - my, d = my;
3284     int x, y;
3285 
3286     for (y = 0; y < h; y++) {
3287         for (x = 0; x < 4; x++)
3288             dst[x] = (c * src[x] + d * src[x + sstride] + 4) >> 3;
3289         dst += dstride;
3290         src += sstride;
3291     }
3292 #endif
3293 }
3294 
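/* 4-pixel variant of the two-pass separable bilinear filter: the horizontal
 * pass writes h + 1 rows into a 4-byte-strided temporary, then the vertical
 * pass filters from the temporary into dst. */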
void ff_put_vp8_bilinear4_hv_mmi(uint8_t *dst, ptrdiff_t dstride, uint8_t *src,
        ptrdiff_t sstride, int h, int mx, int my)
{
#if 1
    DECLARE_ALIGNED(4, uint8_t, tmp_array[36]);
    uint8_t *tmp = tmp_array;

    ff_put_vp8_bilinear4_h_mmi(tmp, 4, src, sstride, h + 1, mx, my);
    ff_put_vp8_bilinear4_v_mmi(dst, dstride, tmp, 4, h, mx, my);
#else
    int a = 8 - mx, b = mx;
    int c = 8 - my, d = my;
    int x, y;
    uint8_t tmp_array[36];
    uint8_t *tmp = tmp_array;

    for (y = 0; y < h + 1; y++) {
        for (x = 0; x < 4; x++)
            tmp[x] = (a * src[x] + b * src[x + 1] + 4) >> 3;
        tmp += 4;
        src += sstride;
    }

    tmp = tmp_array;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 4; x++)
            dst[x] = (c * tmp[x] + d * tmp[x + 4] + 4) >> 3;
        dst += dstride;
        tmp += 4;
    }
#endif
}

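/* Presumably these helpers are installed by the MIPS vp8dsp init code
 * (vp8dsp_init_mips.c) into VP8DSPContext.put_vp8_bilinear_pixels_tab,
 * selected by block width and by whether mx/my are nonzero; the scalar
 * #else branches above mirror the generic C fallbacks. */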