/*
 * VC-1 and WMV3 - DSP functions Loongson MMI-optimized
 *
 * Copyright (c) 2016 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/avassert.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "vc1dsp_mips.h"
#include "hpeldsp_mips.h"
#include "libavutil/mips/mmiutils.h"

#define VC1_INV_TRANCS_8_TYPE1(o1, o2, r1, r2, r3, r4, c0)            \
    "li         %[tmp0],    "#r1"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp13]                       \n\t"     \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]           \n\t"     \
    "li         %[tmp0],    "#r2"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp14]                       \n\t"     \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]           \n\t"     \
    "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]           \n\t"     \
    "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]            \n\t"     \
    "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]           \n\t"     \
    "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]            \n\t"     \
                                                                      \
    "li         %[tmp0],    "#r3"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp13]                       \n\t"     \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]           \n\t"     \
    "li         %[tmp0],    "#r4"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp14]                       \n\t"     \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]           \n\t"     \
    "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]           \n\t"     \
    "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]            \n\t"     \
    "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]           \n\t"     \
    "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]           \n\t"     \
                                                                      \
    "paddw      %[ftmp1],   %[ftmp1],   "#c0"               \n\t"     \
    "paddw      %[ftmp2],   %[ftmp2],   "#c0"               \n\t"     \
    "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]            \n\t"     \
    "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]            \n\t"     \
    "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]            \n\t"     \
    "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]            \n\t"     \
    "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]            \n\t"     \
    "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"     \
    "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]            \n\t"     \
    "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"     \
    "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]            \n\t"     \
    "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]            \n\t"     \
    "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]            \n\t"     \
    "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]            \n\t"     \
    "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]            \n\t"     \
    "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]            \n\t"

#define VC1_INV_TRANCS_8_TYPE2(o1, o2, r1, r2, r3, r4, c0, c1)        \
    "li         %[tmp0],    "#r1"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp13]                       \n\t"     \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]           \n\t"     \
    "li         %[tmp0],    "#r2"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp14]                       \n\t"     \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]           \n\t"     \
    "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp14]           \n\t"     \
    "paddw      %[ftmp1],   %[ftmp1],   %[ftmp2]            \n\t"     \
    "pmaddhw    %[ftmp2],   %[ftmp6],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp3],   %[ftmp8],   %[ftmp14]           \n\t"     \
    "paddw      %[ftmp2],   %[ftmp2],   %[ftmp3]            \n\t"     \
                                                                      \
    "li         %[tmp0],    "#r3"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp13]                       \n\t"     \
    "punpcklwd  %[ftmp13],  %[ftmp13],  %[ftmp13]           \n\t"     \
    "li         %[tmp0],    "#r4"                           \n\t"     \
    "mtc1       %[tmp0],    %[ftmp14]                       \n\t"     \
    "punpcklwd  %[ftmp14],  %[ftmp14],  %[ftmp14]           \n\t"     \
    "pmaddhw    %[ftmp3],   %[ftmp9],   %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp4],   %[ftmp11],  %[ftmp14]           \n\t"     \
    "paddw      %[ftmp3],   %[ftmp3],   %[ftmp4]            \n\t"     \
    "pmaddhw    %[ftmp4],   %[ftmp10],  %[ftmp13]           \n\t"     \
    "pmaddhw    %[ftmp13],  %[ftmp12],  %[ftmp14]           \n\t"     \
    "paddw      %[ftmp4],   %[ftmp4],   %[ftmp13]           \n\t"     \
                                                                      \
    "paddw      %[ftmp13],  %[ftmp1],   %[ftmp3]            \n\t"     \
    "psubw      %[ftmp14],  %[ftmp1],   %[ftmp3]            \n\t"     \
    "paddw      %[ftmp14],  %[ftmp14],  "#c1"               \n\t"     \
    "paddw      %[ftmp1],   %[ftmp2],   %[ftmp4]            \n\t"     \
    "psubw      %[ftmp3],   %[ftmp2],   %[ftmp4]            \n\t"     \
    "paddw      %[ftmp3],   %[ftmp3],   "#c1"               \n\t"     \
    "paddw      %[ftmp13],  %[ftmp13],  "#c0"               \n\t"     \
    "paddw      %[ftmp14],  %[ftmp14],  "#c0"               \n\t"     \
    "paddw      %[ftmp1],   %[ftmp1],   "#c0"               \n\t"     \
    "paddw      %[ftmp3],   %[ftmp3],   "#c0"               \n\t"     \
    "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]            \n\t"     \
    "psraw      %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"     \
    "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]            \n\t"     \
    "psraw      %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"     \
    "punpcklhw  %[ftmp2],   %[ftmp13],  %[ftmp1]            \n\t"     \
    "punpckhhw  %[ftmp4],   %[ftmp13],  %[ftmp1]            \n\t"     \
    "punpcklhw  "#o1",      %[ftmp2],   %[ftmp4]            \n\t"     \
    "punpcklhw  %[ftmp2],   %[ftmp14],  %[ftmp3]            \n\t"     \
    "punpckhhw  %[ftmp4],   %[ftmp14],  %[ftmp3]            \n\t"     \
    "punpcklhw  "#o2",      %[ftmp2],   %[ftmp4]            \n\t"

/* Do inverse transform on 8x8 block */
void ff_vc1_inv_trans_8x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    mips_reg addr[1];
    int count;

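    /* Folded DC rounding: (3 * dc + 1) >> 1 == (12 * dc + 4) >> 3 (8-point
     * row pass, DC gain 12) and (3 * dc + 16) >> 5 == (12 * dc + 64) >> 7
     * (8-point column pass). */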
    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"
        "li         %[count],   0x02                            \n\t"

        "1:                                                     \n\t"
        MMI_LDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]         \n\t"
        MMI_LDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_LDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_LDC1(%[ftmp4], %[addr0], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]            \n\t"

        MMI_SDC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[addr0],   %[dest],    %[linesize]         \n\t"
        MMI_SDC1(%[ftmp2], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_SDC1(%[ftmp3], %[addr0], 0x00)
        PTR_ADDU   "%[addr0],   %[addr0],   %[linesize]         \n\t"
        MMI_SDC1(%[ftmp4], %[addr0], 0x00)

        "addiu      %[count],   %[count],   -0x01               \n\t"
        PTR_ADDU   "%[dest],    %[addr0],   %[linesize]         \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]),
          [addr0]"=&r"(addr[0]),
          [count]"=&r"(count), [dest]"+&r"(dest)
        : [linesize]"r"((mips_reg)linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x8_mmi(int16_t block[64])
{
    DECLARE_ALIGNED(16, int16_t, temp[64]);
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    double ftmp[23];
    uint64_t tmp[1];

    __asm__ volatile (
        /* 1st loop: start */
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[block], 0x00)
        MMI_LDC1(%[ftmp11], %[block], 0x10)
        MMI_LDC1(%[ftmp2], %[block], 0x20)
        MMI_LDC1(%[ftmp12], %[block], 0x30)
        MMI_LDC1(%[ftmp3], %[block], 0x40)
        MMI_LDC1(%[ftmp13], %[block], 0x50)
        MMI_LDC1(%[ftmp4], %[block], 0x60)
        MMI_LDC1(%[ftmp14], %[block], 0x70)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp15], %[temp], 0x00)
        MMI_SDC1(%[ftmp19], %[temp], 0x08)
        MMI_SDC1(%[ftmp16], %[temp], 0x10)
        MMI_SDC1(%[ftmp20], %[temp], 0x18)
        MMI_SDC1(%[ftmp17], %[temp], 0x20)
        MMI_SDC1(%[ftmp21], %[temp], 0x28)
        MMI_SDC1(%[ftmp18], %[temp], 0x30)
        MMI_SDC1(%[ftmp22], %[temp], 0x38)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[block], 0x08)
        MMI_LDC1(%[ftmp11], %[block], 0x18)
        MMI_LDC1(%[ftmp2], %[block], 0x28)
        MMI_LDC1(%[ftmp12], %[block], 0x38)
        MMI_LDC1(%[ftmp3], %[block], 0x48)
        MMI_LDC1(%[ftmp13], %[block], 0x58)
        MMI_LDC1(%[ftmp4], %[block], 0x68)
        MMI_LDC1(%[ftmp14], %[block], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_4])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_4])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_4])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE1(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_4])

        TRANSPOSE_4H(%[ftmp15], %[ftmp16], %[ftmp17], %[ftmp18],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        TRANSPOSE_4H(%[ftmp19], %[ftmp20], %[ftmp21], %[ftmp22],
                     %[ftmp1],  %[ftmp2],  %[ftmp3],  %[ftmp4])

        MMI_SDC1(%[ftmp19], %[temp], 0x48)
        MMI_SDC1(%[ftmp20], %[temp], 0x58)
        MMI_SDC1(%[ftmp21], %[temp], 0x68)
        MMI_SDC1(%[ftmp22], %[temp], 0x78)
        /* 1st loop: end */

        /* 2nd loop: start */
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        // 1st part
        MMI_LDC1(%[ftmp1], %[temp], 0x00)
        MMI_LDC1(%[ftmp11], %[temp], 0x10)
        MMI_LDC1(%[ftmp2], %[temp], 0x20)
        MMI_LDC1(%[ftmp12], %[temp], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp15],  %[ftmp17]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp15],  %[ftmp17]           \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp16],  %[ftmp18]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp16],  %[ftmp18]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x00)
        MMI_SDC1(%[ftmp16], %[block], 0x10)
        MMI_SDC1(%[ftmp17], %[block], 0x20)
        MMI_SDC1(%[ftmp18], %[block], 0x30)
        MMI_SDC1(%[ftmp19], %[block], 0x40)
        MMI_SDC1(%[ftmp20], %[block], 0x50)
        MMI_SDC1(%[ftmp21], %[block], 0x60)
        MMI_SDC1(%[ftmp22], %[block], 0x70)

        // 2nd part
        MMI_LDC1(%[ftmp1], %[temp], 0x08)
        MMI_LDC1(%[ftmp11], %[temp], 0x18)
        MMI_LDC1(%[ftmp2], %[temp], 0x28)
        MMI_LDC1(%[ftmp12], %[temp], 0x38)
        MMI_LDC1(%[ftmp3], %[temp], 0x48)
        MMI_LDC1(%[ftmp13], %[temp], 0x58)
        MMI_LDC1(%[ftmp4], %[temp], 0x68)
        MMI_LDC1(%[ftmp14], %[temp], 0x78)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        "punpcklhw  %[ftmp9],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp13],  %[ftmp14]           \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_SDC1(%[ftmp15], %[block], 0x08)
        MMI_SDC1(%[ftmp16], %[block], 0x18)
        MMI_SDC1(%[ftmp17], %[block], 0x28)
        MMI_SDC1(%[ftmp18], %[block], 0x38)
        MMI_SDC1(%[ftmp19], %[block], 0x48)
        MMI_SDC1(%[ftmp20], %[block], 0x58)
        MMI_SDC1(%[ftmp21], %[block], 0x68)
        MMI_SDC1(%[ftmp22], %[block], 0x78)
        /* 2nd loop: end */
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [ff_pw_4]"f"(ff_pw_4_local), [block]"r"(block),
          [temp]"r"(temp)
        : "memory"
    );
}
#endif

/* Do inverse transform on 8x4 part of block */
void ff_vc1_inv_trans_8x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];

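    /* DC gains: 12 for the 8-point row pass, folded as (3 * dc + 1) >> 1
     * == (12 * dc + 4) >> 3, then 17 for the 4-point column pass with its
     * (+64) >> 7 rounding. */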
    dc = ( 3 * dc +  1) >> 1;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LDC1(%[ftmp1], %[dest0], 0x00)
        MMI_LDC1(%[ftmp2], %[dest1], 0x00)
        MMI_LDC1(%[ftmp3], %[dest2], 0x00)
        MMI_LDC1(%[ftmp4], %[dest3], 0x00)

        "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp6]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp7]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp8]            \n\t"

        MMI_SDC1(%[ftmp1], %[dest0], 0x00)
        MMI_SDC1(%[ftmp2], %[dest1], 0x00)
        MMI_SDC1(%[ftmp3], %[dest2], 0x00)
        MMI_SDC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_8x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t tmp[1];
    int16_t count = 4;
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(16, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    int16_t coeff[64] = {12,  16,  16,  15,  12,   9,   6,   4,
                         12,  15,   6,  -4, -12, -16, -16,  -9,
                         12,   9,  -6, -16, -12,   4,  16,  15,
                         12,   4, -16,  -9,  12,  15,  -6, -16,
                         12,  -4, -16,   9,  12, -15,  -6,  16,
                         12,  -9,  -6,  16, -12,  -4,  16, -15,
                         12, -15,   6,   4, -12,  16, -16,   9,
                         12, -16,  16, -15,  12,  -9,   6,  -4};
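
    /* Each row of coeff[] is one basis vector of the 8-point VC-1 inverse
     * transform, so the 1st loop below computes, for every block row,
     * dst[i] = (coeff_row_i . src + 4) >> 3 via pmaddhw pair sums. */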

    // 1st loop
    __asm__ volatile (
        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        "1:                                                     \n\t"
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x08)

        /* ftmp11: dst1,dst0 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x00)
        MMI_LDC1(%[ftmp4], %[coeff], 0x08)
        MMI_LDC1(%[ftmp5], %[coeff], 0x10)
        MMI_LDC1(%[ftmp6], %[coeff], 0x18)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp11],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp11],  %[ftmp11],  %[ff_pw_4]          \n\t"

        /* ftmp12: dst3,dst2 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x20)
        MMI_LDC1(%[ftmp4], %[coeff], 0x28)
        MMI_LDC1(%[ftmp5], %[coeff], 0x30)
        MMI_LDC1(%[ftmp6], %[coeff], 0x38)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp12],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp12],  %[ftmp12],  %[ff_pw_4]          \n\t"

        /* ftmp13: dst5,dst4 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x40)
        MMI_LDC1(%[ftmp4], %[coeff], 0x48)
        MMI_LDC1(%[ftmp5], %[coeff], 0x50)
        MMI_LDC1(%[ftmp6], %[coeff], 0x58)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp13],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp13],  %[ftmp13],  %[ff_pw_4]          \n\t"

        /* ftmp14: dst7,dst6 */
        MMI_LDC1(%[ftmp3], %[coeff], 0x60)
        MMI_LDC1(%[ftmp4], %[coeff], 0x68)
        MMI_LDC1(%[ftmp5], %[coeff], 0x70)
        MMI_LDC1(%[ftmp6], %[coeff], 0x78)
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp1],   %[ftmp5]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp2],   %[ftmp6]            \n\t"
        "paddw      %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklwd  %[ftmp7],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhwd  %[ftmp8],   %[ftmp9],   %[ftmp10]           \n\t"
        "paddw      %[ftmp14],  %[ftmp7],   %[ftmp8]            \n\t"
        "paddw      %[ftmp14],  %[ftmp14],  %[ff_pw_4]          \n\t"

        /* ftmp9: dst3,dst2,dst1,dst0 ftmp10: dst7,dst6,dst5,dst4 */
        "psraw      %[ftmp11],  %[ftmp11],  %[ftmp0]            \n\t"
        "psraw      %[ftmp12],  %[ftmp12],  %[ftmp0]            \n\t"
        "psraw      %[ftmp13],  %[ftmp13],  %[ftmp0]            \n\t"
        "psraw      %[ftmp14],  %[ftmp14],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp11],  %[ftmp12]           \n\t"
        "punpcklhw  %[ftmp9],   %[ftmp7],   %[ftmp8]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp13],  %[ftmp14]           \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp13],  %[ftmp14]           \n\t"
        "punpcklhw  %[ftmp10],  %[ftmp7],   %[ftmp8]            \n\t"
        MMI_SDC1(%[ftmp9], %[dst], 0x00)
        MMI_SDC1(%[ftmp10], %[dst], 0x08)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst), [count]"+&r"(count)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
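        /* 0x44 is the pshufh selector (0,1,0,1): it copies the two low
         * halfwords into the high half, splatting a packed coefficient
         * pair across the register for the pmaddhw dot products below. */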
599 "li %[tmp0], 0x44 \n\t"
600 "mtc1 %[tmp0], %[ftmp15] \n\t"
601
602 // 1st part
603 "li %[tmp0], 0x07 \n\t"
604 "mtc1 %[tmp0], %[ftmp0] \n\t"
605 MMI_LDC1(%[ftmp1], %[src], 0x00)
606 MMI_LDC1(%[ftmp2], %[src], 0x10)
607 MMI_LDC1(%[ftmp3], %[src], 0x20)
608 MMI_LDC1(%[ftmp4], %[src], 0x30)
609 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
610 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
611 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
612 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
613
614 /* ftmp11: dst03,dst02,dst01,dst00 */
615 "li %[tmp0], 0x00160011 \n\t"
616 "mtc1 %[tmp0], %[ftmp3] \n\t"
617 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
618 "li %[tmp0], 0x000a0011 \n\t"
619 "mtc1 %[tmp0], %[ftmp4] \n\t"
620 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
621 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
622 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
623 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
624 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
625 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
626 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
627 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
628 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
629 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
630 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
631 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
632 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
633 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
634
635 /* ftmp12: dst13,dst12,dst11,dst10 */
636 "li %[tmp0], 0x000a0011 \n\t"
637 "mtc1 %[tmp0], %[ftmp3] \n\t"
638 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
639 "li %[tmp0], 0xffeaffef \n\t"
640 "mtc1 %[tmp0], %[ftmp4] \n\t"
641 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
642 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
643 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
644 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
645 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
646 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
647 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
648 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
649 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
650 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
651 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
652 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
653 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
654 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
655
656 /* ftmp13: dst23,dst22,dst21,dst20 */
657 "li %[tmp0], 0xfff60011 \n\t"
658 "mtc1 %[tmp0], %[ftmp3] \n\t"
659 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
660 "li %[tmp0], 0x0016ffef \n\t"
661 "mtc1 %[tmp0], %[ftmp4] \n\t"
662 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
663 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
664 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
665 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
666 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
667 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
668 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
669 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
670 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
671 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
672 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
673 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
674 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
675 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
676
677 /* ftmp14: dst33,dst32,dst31,dst30 */
678 "li %[tmp0], 0xffea0011 \n\t"
679 "mtc1 %[tmp0], %[ftmp3] \n\t"
680 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
681 "li %[tmp0], 0xfff60011 \n\t"
682 "mtc1 %[tmp0], %[ftmp4] \n\t"
683 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
684 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
685 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
686 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
687 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
688 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
689 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
690 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
691 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
692 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
693 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
694 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
695 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
696 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
697
698 MMI_LWC1(%[ftmp1], %[dest], 0x00)
699 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
700 MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
701 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
702 MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
703 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
704 MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
705 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
706 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
707 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
708 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
709 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
710 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
711 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
712 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
713 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
714 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
715 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
716 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
717 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
718 MMI_SWC1(%[ftmp1], %[dest], 0x00)
719 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
720 MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
721 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
722 MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
723 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
724 MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
725
726 // 2nd part
727 "li %[tmp0], 0x07 \n\t"
728 "mtc1 %[tmp0], %[ftmp0] \n\t"
729 MMI_LDC1(%[ftmp1], %[src], 0x08)
730 MMI_LDC1(%[ftmp2], %[src], 0x18)
731 MMI_LDC1(%[ftmp3], %[src], 0x28)
732 MMI_LDC1(%[ftmp4], %[src], 0x38)
733 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp2] \n\t"
734 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp2] \n\t"
735 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp4] \n\t"
736 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp4] \n\t"
737
738 /* ftmp11: dst03,dst02,dst01,dst00 */
739 "li %[tmp0], 0x00160011 \n\t"
740 "mtc1 %[tmp0], %[ftmp3] \n\t"
741 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
742 "li %[tmp0], 0x000a0011 \n\t"
743 "mtc1 %[tmp0], %[ftmp4] \n\t"
744 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
745 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
746 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
747 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
748 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
749 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
750 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
751 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
752 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
753 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
754 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
755 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
756 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
757 "punpcklhw %[ftmp11], %[ftmp1], %[ftmp2] \n\t"
758
759 /* ftmp12: dst13,dst12,dst11,dst10 */
760 "li %[tmp0], 0x000a0011 \n\t"
761 "mtc1 %[tmp0], %[ftmp3] \n\t"
762 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
763 "li %[tmp0], 0xffeaffef \n\t"
764 "mtc1 %[tmp0], %[ftmp4] \n\t"
765 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
766 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
767 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
768 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
769 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
770 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
771 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
772 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
773 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
774 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
775 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
776 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
777 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
778 "punpcklhw %[ftmp12], %[ftmp1], %[ftmp2] \n\t"
779
780 /* ftmp13: dst23,dst22,dst21,dst20 */
781 "li %[tmp0], 0xfff60011 \n\t"
782 "mtc1 %[tmp0], %[ftmp3] \n\t"
783 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
784 "li %[tmp0], 0x0016ffef \n\t"
785 "mtc1 %[tmp0], %[ftmp4] \n\t"
786 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
787 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
788 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
789 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
790 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
791 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
792 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
793 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
794 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
795 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
796 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
797 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
798 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
799 "punpcklhw %[ftmp13], %[ftmp1], %[ftmp2] \n\t"
800
801 /* ftmp14: dst33,dst32,dst31,dst30 */
802 "li %[tmp0], 0xffea0011 \n\t"
803 "mtc1 %[tmp0], %[ftmp3] \n\t"
804 "pshufh %[ftmp3], %[ftmp3], %[ftmp15] \n\t"
805 "li %[tmp0], 0xfff60011 \n\t"
806 "mtc1 %[tmp0], %[ftmp4] \n\t"
807 "pshufh %[ftmp4], %[ftmp4], %[ftmp15] \n\t"
808 "pmaddhw %[ftmp1], %[ftmp5], %[ftmp3] \n\t"
809 "pmaddhw %[ftmp2], %[ftmp7], %[ftmp4] \n\t"
810 "paddw %[ftmp9], %[ftmp1], %[ftmp2] \n\t"
811 "pmaddhw %[ftmp1], %[ftmp6], %[ftmp3] \n\t"
812 "pmaddhw %[ftmp2], %[ftmp8], %[ftmp4] \n\t"
813 "paddw %[ftmp10], %[ftmp1], %[ftmp2] \n\t"
814 "paddw %[ftmp9], %[ftmp9], %[ff_pw_64] \n\t"
815 "paddw %[ftmp10], %[ftmp10], %[ff_pw_64] \n\t"
816 "psraw %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
817 "psraw %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
818 "punpcklhw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
819 "punpckhhw %[ftmp2], %[ftmp9], %[ftmp10] \n\t"
820 "punpcklhw %[ftmp14], %[ftmp1], %[ftmp2] \n\t"
821
822 MMI_LWC1(%[ftmp1], %[dest], 0x04)
823 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
824 MMI_LWC1(%[ftmp2], %[tmp0], 0x04)
825 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
826 MMI_LWC1(%[ftmp3], %[tmp0], 0x04)
827 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
828 MMI_LWC1(%[ftmp4], %[tmp0], 0x04)
829 "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
830 "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
831 "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
832 "punpcklbh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
833 "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
834 "paddh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
835 "paddh %[ftmp2], %[ftmp2], %[ftmp12] \n\t"
836 "paddh %[ftmp3], %[ftmp3], %[ftmp13] \n\t"
837 "paddh %[ftmp4], %[ftmp4], %[ftmp14] \n\t"
838 "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
839 "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
840 "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
841 "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
842 MMI_SWC1(%[ftmp1], %[dest], 0x04)
843 PTR_ADDU "%[tmp0], %[dest], %[linesize] \n\t"
844 MMI_SWC1(%[ftmp2], %[tmp0], 0x04)
845 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
846 MMI_SWC1(%[ftmp3], %[tmp0], 0x04)
847 PTR_ADDU "%[tmp0], %[tmp0], %[linesize] \n\t"
848 MMI_SWC1(%[ftmp4], %[tmp0], 0x04)
849
850 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
851 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
852 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
853 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
854 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
855 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
856 [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
857 [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
858 [tmp0]"=&r"(tmp[0])
859 : [ff_pw_64]"f"(ff_pw_64_local),
860 [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
861 :"memory"
862 );
863 }
864 #endif

/* Do inverse transform on 4x8 parts of block */
void ff_vc1_inv_trans_4x8_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[9];
    DECLARE_VAR_LOW32;

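    /* DC gains: 17 for the 4-point row pass with (+4) >> 3 rounding, then
     * 12 for the 8-point column pass with (+64) >> 7 rounding. */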
    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)
        MMI_LWC1(%[ftmp5], %[dest4], 0x00)
        MMI_LWC1(%[ftmp6], %[dest5], 0x00)
        MMI_LWC1(%[ftmp7], %[dest6], 0x00)
        MMI_LWC1(%[ftmp8], %[dest7], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"
        "paddsh     %[ftmp5],   %[ftmp5],   %[dc]               \n\t"
        "paddsh     %[ftmp6],   %[ftmp6],   %[dc]               \n\t"
        "paddsh     %[ftmp7],   %[ftmp7],   %[dc]               \n\t"
        "paddsh     %[ftmp8],   %[ftmp8],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        MMI_SWC1(%[ftmp5], %[dest4], 0x00)
        MMI_SWC1(%[ftmp6], %[dest5], 0x00)
        MMI_SWC1(%[ftmp7], %[dest6], 0x00)
        MMI_SWC1(%[ftmp8], %[dest7], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          RESTRICT_ASM_LOW32
          [ftmp8]"=&f"(ftmp[8])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dest4]"r"(dest+4*linesize), [dest5]"r"(dest+5*linesize),
          [dest6]"r"(dest+6*linesize), [dest7]"r"(dest+7*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

#if _MIPS_SIM != _ABIO32
void ff_vc1_inv_trans_4x8_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[23];
    uint32_t count = 8, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
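
    /* coeff[] rows are the four basis vectors of the 4-point VC-1 inverse
     * transform (17, 22, 17, 10 with sign/order permutations); the 1st loop
     * computes dst[i] = (coeff_row_i . src + 4) >> 3 for each block row. */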
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_1_local) = {0x0000000100000001ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};

    // 1st loop
    __asm__ volatile (

        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1:                                                     \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]            \n\t"
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]           \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]            \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]          \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]          \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]            \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x20)
        MMI_LDC1(%[ftmp3], %[src], 0x40)
        MMI_LDC1(%[ftmp4], %[src], 0x60)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x10)
        MMI_LDC1(%[ftmp2], %[src], 0x30)
        MMI_LDC1(%[ftmp3], %[src], 0x50)
        MMI_LDC1(%[ftmp4], %[src], 0x70)
        "punpcklhw  %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp12],  %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp15:dst03,dst02,dst01,dst00 ftmp22:dst73,dst72,dst71,dst70 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp15], %[ftmp22], 0x0010000c, 0x0006000c,
                               0x000f0010, 0x00040009, %[ff_pw_64], %[ff_pw_1])

        /* ftmp16:dst13,dst12,dst11,dst10 ftmp21:dst63,dst62,dst61,dst60 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp16], %[ftmp21], 0x0006000c, 0xfff0fff4,
                               0xfffc000f, 0xfff7fff0, %[ff_pw_64], %[ff_pw_1])

        /* ftmp17:dst23,dst22,dst21,dst20 ftmp20:dst53,dst52,dst51,dst50 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp17], %[ftmp20], 0xfffa000c, 0x0010fff4,
                               0xfff00009, 0x000f0004, %[ff_pw_64], %[ff_pw_1])

        /* ftmp18:dst33,dst32,dst31,dst30 ftmp19:dst43,dst42,dst41,dst40 */
        VC1_INV_TRANCS_8_TYPE2(%[ftmp18], %[ftmp19], 0xfff0000c, 0xfffa000c,
                               0xfff70004, 0xfff0000f, %[ff_pw_64], %[ff_pw_1])

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp8], %[tmp0], 0x00)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp15]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp16]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp17]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp18]           \n\t"
        "paddh      %[ftmp5],   %[ftmp5],   %[ftmp19]           \n\t"
        "paddh      %[ftmp6],   %[ftmp6],   %[ftmp20]           \n\t"
        "paddh      %[ftmp7],   %[ftmp7],   %[ftmp21]           \n\t"
        "paddh      %[ftmp8],   %[ftmp8],   %[ftmp22]           \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "packushb   %[ftmp5],   %[ftmp5],   %[ftmp0]            \n\t"
        "packushb   %[ftmp6],   %[ftmp6],   %[ftmp0]            \n\t"
        "packushb   %[ftmp7],   %[ftmp7],   %[ftmp0]            \n\t"
        "packushb   %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp5], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp6], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp7], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp8], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [ftmp16]"=&f"(ftmp[16]), [ftmp17]"=&f"(ftmp[17]),
          [ftmp18]"=&f"(ftmp[18]), [ftmp19]"=&f"(ftmp[19]),
          [ftmp20]"=&f"(ftmp[20]), [ftmp21]"=&f"(ftmp[21]),
          [ftmp22]"=&f"(ftmp[22]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_1]"f"(ff_pw_1_local), [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}
#endif

/* Do inverse transform on 4x4 part of block */
void ff_vc1_inv_trans_4x4_dc_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int dc = block[0];
    double ftmp[5];
    DECLARE_VAR_LOW32;

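    /* Both passes are 4-point, so the DC gain is 17 twice: (+4) >> 3 after
     * the row pass, (+64) >> 7 after the column pass. */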
    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;

    __asm__ volatile(
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "pshufh     %[dc],      %[dc],      %[ftmp0]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest0], 0x00)
        MMI_LWC1(%[ftmp2], %[dest1], 0x00)
        MMI_LWC1(%[ftmp3], %[dest2], 0x00)
        MMI_LWC1(%[ftmp4], %[dest3], 0x00)

        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        "paddsh     %[ftmp1],   %[ftmp1],   %[dc]               \n\t"
        "paddsh     %[ftmp2],   %[ftmp2],   %[dc]               \n\t"
        "paddsh     %[ftmp3],   %[ftmp3],   %[dc]               \n\t"
        "paddsh     %[ftmp4],   %[ftmp4],   %[dc]               \n\t"

        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest0], 0x00)
        MMI_SWC1(%[ftmp2], %[dest1], 0x00)
        MMI_SWC1(%[ftmp3], %[dest2], 0x00)
        MMI_SWC1(%[ftmp4], %[dest3], 0x00)
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          RESTRICT_ASM_LOW32
          [ftmp4]"=&f"(ftmp[4])
        : [dest0]"r"(dest+0*linesize), [dest1]"r"(dest+1*linesize),
          [dest2]"r"(dest+2*linesize), [dest3]"r"(dest+3*linesize),
          [dc]"f"(dc)
        : "memory"
    );
}

void ff_vc1_inv_trans_4x4_mmi(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    int16_t *src = block;
    int16_t *dst = block;
    double ftmp[16];
    uint32_t count = 4, tmp[1];
    int16_t coeff[16] = {17, 22, 17, 10,
                         17, 10,-17,-22,
                         17,-10,-17, 22,
                         17,-22, 17,-10};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_4_local) = {0x0000000400000004ULL};
    DECLARE_ALIGNED(8, const uint64_t, ff_pw_64_local)= {0x0000004000000040ULL};
    // 1st loop
    __asm__ volatile (

        "li         %[tmp0],    0x03                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        MMI_LDC1(%[ftmp2], %[coeff], 0x00)
        MMI_LDC1(%[ftmp3], %[coeff], 0x08)
        MMI_LDC1(%[ftmp4], %[coeff], 0x10)
        MMI_LDC1(%[ftmp5], %[coeff], 0x18)
        "1:                                                     \n\t"
        /* ftmp8: dst3,dst2,dst1,dst0 */
        MMI_LDC1(%[ftmp1], %[src], 0x00)
        "pmaddhw    %[ftmp6],   %[ftmp2],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp7],   %[ftmp3],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp8],   %[ftmp4],   %[ftmp1]            \n\t"
        "pmaddhw    %[ftmp9],   %[ftmp5],   %[ftmp1]            \n\t"
        "punpcklwd  %[ftmp10],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpckhwd  %[ftmp11],  %[ftmp6],   %[ftmp7]            \n\t"
        "punpcklwd  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhwd  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "paddw      %[ftmp8],   %[ftmp10],  %[ftmp11]           \n\t"
        "paddw      %[ftmp9],   %[ftmp6],   %[ftmp7]            \n\t"
        "paddw      %[ftmp8],   %[ftmp8],   %[ff_pw_4]          \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_4]          \n\t"
        "psraw      %[ftmp8],   %[ftmp8],   %[ftmp0]            \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp6],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpckhhw  %[ftmp7],   %[ftmp8],   %[ftmp9]            \n\t"
        "punpcklhw  %[ftmp8],   %[ftmp6],   %[ftmp7]            \n\t"
        MMI_SDC1(%[ftmp8], %[dst], 0x00)

        PTR_ADDIU  "%[src],     %[src],     0x10                \n\t"
        PTR_ADDIU  "%[dst],     %[dst],     0x10                \n\t"
        "addiu      %[count],   %[count],   -0x01               \n\t"
        "bnez       %[count],   1b                              \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [tmp0]"=&r"(tmp[0]), [count]"+&r"(count),
          [src]"+&r"(src), [dst]"+&r"(dst)
        : [ff_pw_4]"f"(ff_pw_4_local), [coeff]"r"(coeff)
        : "memory"
    );

    src = block;

    // 2nd loop
    __asm__ volatile (
        "li         %[tmp0],    0x07                            \n\t"
        "mtc1       %[tmp0],    %[ftmp0]                        \n\t"
        "li         %[tmp0],    0x44                            \n\t"
        "mtc1       %[tmp0],    %[ftmp15]                       \n\t"

        MMI_LDC1(%[ftmp1], %[src], 0x00)
        MMI_LDC1(%[ftmp2], %[src], 0x10)
        MMI_LDC1(%[ftmp3], %[src], 0x20)
        MMI_LDC1(%[ftmp4], %[src], 0x30)
        "punpcklhw  %[ftmp5],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpckhhw  %[ftmp6],   %[ftmp1],   %[ftmp2]            \n\t"
        "punpcklhw  %[ftmp7],   %[ftmp3],   %[ftmp4]            \n\t"
        "punpckhhw  %[ftmp8],   %[ftmp3],   %[ftmp4]            \n\t"

        /* ftmp11: dst03,dst02,dst01,dst00 */
        "li         %[tmp0],    0x00160011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp11],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp12: dst13,dst12,dst11,dst10 */
        "li         %[tmp0],    0x000a0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xffeaffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp12],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp13: dst23,dst22,dst21,dst20 */
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0x0016ffef                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp13],  %[ftmp1],   %[ftmp2]            \n\t"

        /* ftmp14: dst33,dst32,dst31,dst30 */
        "li         %[tmp0],    0xffea0011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp3]                        \n\t"
        "pshufh     %[ftmp3],   %[ftmp3],   %[ftmp15]           \n\t"
        "li         %[tmp0],    0xfff60011                      \n\t"
        "mtc1       %[tmp0],    %[ftmp4]                        \n\t"
        "pshufh     %[ftmp4],   %[ftmp4],   %[ftmp15]           \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp5],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp7],   %[ftmp4]            \n\t"
        "paddw      %[ftmp9],   %[ftmp1],   %[ftmp2]            \n\t"
        "pmaddhw    %[ftmp1],   %[ftmp6],   %[ftmp3]            \n\t"
        "pmaddhw    %[ftmp2],   %[ftmp8],   %[ftmp4]            \n\t"
        "paddw      %[ftmp10],  %[ftmp1],   %[ftmp2]            \n\t"
        "paddw      %[ftmp9],   %[ftmp9],   %[ff_pw_64]         \n\t"
        "paddw      %[ftmp10],  %[ftmp10],  %[ff_pw_64]         \n\t"
        "psraw      %[ftmp9],   %[ftmp9],   %[ftmp0]            \n\t"
        "psraw      %[ftmp10],  %[ftmp10],  %[ftmp0]            \n\t"
        "punpcklhw  %[ftmp1],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpckhhw  %[ftmp2],   %[ftmp9],   %[ftmp10]           \n\t"
        "punpcklhw  %[ftmp14],  %[ftmp1],   %[ftmp2]            \n\t"

        MMI_LWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_LWC1(%[ftmp4], %[tmp0], 0x00)
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"
        "paddh      %[ftmp1],   %[ftmp1],   %[ftmp11]           \n\t"
        "paddh      %[ftmp2],   %[ftmp2],   %[ftmp12]           \n\t"
        "paddh      %[ftmp3],   %[ftmp3],   %[ftmp13]           \n\t"
        "paddh      %[ftmp4],   %[ftmp4],   %[ftmp14]           \n\t"
        "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]            \n\t"
        "packushb   %[ftmp2],   %[ftmp2],   %[ftmp0]            \n\t"
        "packushb   %[ftmp3],   %[ftmp3],   %[ftmp0]            \n\t"
        "packushb   %[ftmp4],   %[ftmp4],   %[ftmp0]            \n\t"

        MMI_SWC1(%[ftmp1], %[dest], 0x00)
        PTR_ADDU   "%[tmp0],    %[dest],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp2], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp3], %[tmp0], 0x00)
        PTR_ADDU   "%[tmp0],    %[tmp0],    %[linesize]         \n\t"
        MMI_SWC1(%[ftmp4], %[tmp0], 0x00)

        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
          [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]),
          [ftmp14]"=&f"(ftmp[14]), [ftmp15]"=&f"(ftmp[15]),
          [tmp0]"=&r"(tmp[0])
        : [ff_pw_64]"f"(ff_pw_64_local),
          [src]"r"(src), [dest]"r"(dest), [linesize]"r"(linesize)
        : "memory"
    );
}

/* Apply overlap transform to horizontal edge */
void ff_vc1_h_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
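    /* Smooth the four pixels straddling the edge (src[-2..1]); the rounding
     * term alternates between rows as rnd toggles between 0 and 1. */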
1383 for (i = 0; i < 8; i++) {
1384 a = src[-2];
1385 b = src[-1];
1386 c = src[0];
1387 d = src[1];
1388 d1 = (a - d + 3 + rnd) >> 3;
1389 d2 = (a - d + b - c + 4 - rnd) >> 3;
1390
1391 src[-2] = a - d1;
1392 src[-1] = av_clip_uint8(b - d2);
1393 src[0] = av_clip_uint8(c + d2);
1394 src[1] = d + d1;
1395 src += stride;
1396 rnd = !rnd;
1397 }
1398 }

void ff_vc1_h_s_overlap_mmi(int16_t *left, int16_t *right, int left_stride,
                            int right_stride, int flags)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = flags & 2 ? 3 : 4;
    int rnd2 = 7 - rnd1;
    for (i = 0; i < 8; i++) {
        a  = left[6];
        b  = left[7];
        c  = right[0];
        d  = right[1];
        d1 = a - d;
        d2 = a - d + b - c;

        left[6]  = ((a << 3) - d1 + rnd1) >> 3;
        left[7]  = ((b << 3) - d2 + rnd2) >> 3;
        right[0] = ((c << 3) + d2 + rnd1) >> 3;
        right[1] = ((d << 3) + d1 + rnd2) >> 3;

        right += right_stride;
        left  += left_stride;
        if (flags & 1) {
            rnd2 = 7 - rnd2;
            rnd1 = 7 - rnd1;
        }
    }
}

/* Apply overlap transform to vertical edge */
void ff_vc1_v_overlap_mmi(uint8_t *src, int stride)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd = 1;
    for (i = 0; i < 8; i++) {
        a  = src[-2 * stride];
        b  = src[-stride];
        c  = src[0];
        d  = src[stride];
        d1 = (a - d + 3 + rnd) >> 3;
        d2 = (a - d + b - c + 4 - rnd) >> 3;

        src[-2 * stride] = a - d1;
        src[-stride]     = av_clip_uint8(b - d2);
        src[0]           = av_clip_uint8(c + d2);
        src[stride]      = d + d1;
        src++;
        rnd = !rnd;
    }
}

void ff_vc1_v_s_overlap_mmi(int16_t *top, int16_t *bottom)
{
    int i;
    int a, b, c, d;
    int d1, d2;
    int rnd1 = 4, rnd2 = 3;
    for (i = 0; i < 8; i++) {
        a  = top[48];
        b  = top[56];
        c  = bottom[0];
        d  = bottom[8];
        d1 = a - d;
        d2 = a - d + b - c;

        top[48]   = ((a << 3) - d1 + rnd1) >> 3;
        top[56]   = ((b << 3) - d2 + rnd2) >> 3;
        bottom[0] = ((c << 3) + d2 + rnd1) >> 3;
        bottom[8] = ((d << 3) + d1 + rnd2) >> 3;

        bottom++;
        top++;
        rnd2 = 7 - rnd2;
        rnd1 = 7 - rnd1;
    }
}
1478
1479 /**
1480 * VC-1 in-loop deblocking filter for one line
1481 * @param src source block type
1482 * @param stride block stride
1483 * @param pq block quantizer
1484 * @return whether other 3 pairs should be filtered or not
1485 * @see 8.6
1486 */
vc1_filter_line(uint8_t * src,int stride,int pq)1487 static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
1488 {
1489 int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
1490 5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
1491 int a0_sign = a0 >> 31; /* Store sign */
1492
1493 a0 = (a0 ^ a0_sign) - a0_sign; /* a0 = FFABS(a0); */
1494 if (a0 < pq) {
1495 int a1 = FFABS((2 * (src[-4 * stride] - src[-1 * stride]) -
1496 5 * (src[-3 * stride] - src[-2 * stride]) + 4) >> 3);
1497 int a2 = FFABS((2 * (src[ 0 * stride] - src[ 3 * stride]) -
1498 5 * (src[ 1 * stride] - src[ 2 * stride]) + 4) >> 3);
1499 if (a1 < a0 || a2 < a0) {
1500 int clip = src[-1 * stride] - src[0 * stride];
1501 int clip_sign = clip >> 31;
1502
1503 clip = ((clip ^ clip_sign) - clip_sign) >> 1;
1504 if (clip) {
1505 int a3 = FFMIN(a1, a2);
1506 int d = 5 * (a3 - a0);
1507 int d_sign = (d >> 31);
1508
1509 d = ((d ^ d_sign) - d_sign) >> 3;
1510 d_sign ^= a0_sign;
1511
1512 if (d_sign ^ clip_sign)
1513 d = 0;
1514 else {
1515 d = FFMIN(d, clip);
1516 d = (d ^ d_sign) - d_sign; /* Restore sign */
1517 src[-1 * stride] = av_clip_uint8(src[-1 * stride] - d);
1518 src[ 0 * stride] = av_clip_uint8(src[ 0 * stride] + d);
1519 }
1520 return 1;
1521 }
1522 }
1523 }
1524 return 0;
1525 }
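
/*
 * The (x ^ sign) - sign pattern above is the standard branchless absolute
 * value: for a two's-complement int, x >> 31 is 0 when x >= 0 and -1 (all
 * ones) when x < 0. A minimal sketch of the trick in isolation
 * (illustrative only, not compiled):
 */
#if 0
static int branchless_abs(int x)
{
    int sign = x >> 31;       /* 0 or -1 */
    return (x ^ sign) - sign; /* complement and add 1 iff negative */
}
#endif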

/**
 * VC-1 in-loop deblocking filter
 * @param src pointer to the first pixel of the edge region to filter
 * @param step distance between horizontally adjacent elements
 * @param stride distance between vertically adjacent elements
 * @param len edge length to filter (4 or 8 pixels)
 * @param pq block quantizer
 * @see 8.6
 */
static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
                                   int len, int pq)
{
    int i;
    int filt3;

    for (i = 0; i < len; i += 4) {
        filt3 = vc1_filter_line(src + 2 * step, stride, pq);
        if (filt3) {
            vc1_filter_line(src + 0 * step, stride, pq);
            vc1_filter_line(src + 1 * step, stride, pq);
            vc1_filter_line(src + 3 * step, stride, pq);
        }
        src += step * 4;
    }
}

void ff_vc1_v_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 4, pq);
}

void ff_vc1_h_loop_filter4_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 4, pq);
}

void ff_vc1_v_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 8, pq);
}

void ff_vc1_h_loop_filter8_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 8, pq);
}

void ff_vc1_v_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, 1, stride, 16, pq);
}

void ff_vc1_h_loop_filter16_mmi(uint8_t *src, int stride, int pq)
{
    vc1_loop_filter(src, stride, 1, 16, pq);
}

void ff_put_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    ff_put_pixels8_8_mmi(dst, src, stride, 8);
}

void ff_put_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    ff_put_pixels16_8_mmi(dst, src, stride, 16);
}

void ff_avg_vc1_mspel_mc00_mmi(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    ff_avg_pixels8_8_mmi(dst, src, stride, 8);
}

void ff_avg_vc1_mspel_mc00_16_mmi(uint8_t *dst, const uint8_t *src,
                                  ptrdiff_t stride, int rnd)
{
    ff_avg_pixels16_8_mmi(dst, src, stride, 16);
}

#define OP_PUT(S, D)
#define OP_AVG(S, D)                                        \
    "ldc1       $f16,   "#S"                    \n\t"       \
    "pavgb      "#D",   "#D",   $f16            \n\t"

/** Add the rounder in $f14 to $f6/$f8 and shift the results right */
#define NORMALIZE_MMI(SHIFT)                                \
    "paddh      $f6,    $f6,    $f14            \n\t" /* +bias-r */ \
    "paddh      $f8,    $f8,    $f14            \n\t" /* +bias-r */ \
    "psrah      $f6,    $f6,    "SHIFT"         \n\t"       \
    "psrah      $f8,    $f8,    "SHIFT"         \n\t"

#define TRANSFER_DO_PACK(OP)                                \
    "packushb   $f6,    $f6,    $f8             \n\t"       \
    OP((%[dst]), $f6)                                       \
    "sdc1       $f6,    0x00(%[dst])            \n\t"

#define TRANSFER_DONT_PACK(OP)                              \
    OP(0(%[dst]), $f6)                                      \
    OP(8(%[dst]), $f8)                                      \
    "sdc1       $f6,    0x00(%[dst])            \n\t"       \
    "sdc1       $f8,    0x08(%[dst])            \n\t"

/** @see MSPEL_FILTER13_CORE for use as the UNPACK macro */
#define DO_UNPACK(reg)                                      \
    "punpcklbh  "reg",  "reg",  $f0             \n\t"
#define DONT_UNPACK(reg)

/** Load the rounder (32-r or 8-r) and broadcast it across $f14 */
#define LOAD_ROUNDER_MMI(ROUND)                             \
    "lwc1       $f14,   "ROUND"                 \n\t"       \
    "punpcklhw  $f14,   $f14,   $f14            \n\t"       \
    "punpcklwd  $f14,   $f14,   $f14            \n\t"

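/*
 * After LOAD_ROUNDER_MMI the 16-bit rounder occupies all four half-words
 * of $f14: lwc1 fetches the 32-bit value, punpcklhw duplicates its low
 * half-word pair and punpcklwd duplicates the resulting word, so e.g.
 * 0x0008 becomes 0x0008000800080008.
 */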

#define SHIFT2_LINE(OFF, R0, R1, R2, R3)                    \
    "paddh      "#R1",  "#R1",  "#R2"           \n\t"       \
    PTR_ADDU   "$9,     %[src], %[stride1]      \n\t"       \
    MMI_ULWC1(R0, $9, 0x00)                                 \
    "pmullh     "#R1",  "#R1",  $f12            \n\t"       \
    "punpcklbh  "#R0",  "#R0",  $f0             \n\t"       \
    PTR_ADDU   "$9,     %[src], %[stride]       \n\t"       \
    MMI_ULWC1(R3, $9, 0x00)                                 \
    "psubh      "#R1",  "#R1",  "#R0"           \n\t"       \
    "punpcklbh  "#R3",  "#R3",  $f0             \n\t"       \
    "paddh      "#R1",  "#R1",  $f14            \n\t"       \
    "psubh      "#R1",  "#R1",  "#R3"           \n\t"       \
    "psrah      "#R1",  "#R1",  %[shift]        \n\t"       \
    MMI_SDC1(R1, %[dst], OFF)                               \
    PTR_ADDU   "%[src], %[src], %[stride]       \n\t"

/** Sacrificing $f12 makes it possible to pipeline loads from src */
static void vc1_put_ver_16b_shift2_mmi(int16_t *dst,
                                       const uint8_t *src, mips_reg stride,
                                       int rnd, int64_t shift)
{
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    __asm__ volatile(
        "xor        $f0,    $f0,    $f0             \n\t"
        "li         $8,     0x03                    \n\t"
        LOAD_ROUNDER_MMI("%[rnd]")
        "ldc1       $f12,   %[ff_pw_9]              \n\t"
        "1:                                         \n\t"
        MMI_ULWC1($f4, %[src], 0x00)
        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"
        MMI_ULWC1($f6, %[src], 0x00)
        "punpcklbh  $f4,    $f4,    $f0             \n\t"
        "punpcklbh  $f6,    $f6,    $f0             \n\t"
        SHIFT2_LINE(  0, $f2, $f4, $f6, $f8)
        SHIFT2_LINE( 24, $f4, $f6, $f8, $f2)
        SHIFT2_LINE( 48, $f6, $f8, $f2, $f4)
        SHIFT2_LINE( 72, $f8, $f2, $f4, $f6)
        SHIFT2_LINE( 96, $f2, $f4, $f6, $f8)
        SHIFT2_LINE(120, $f4, $f6, $f8, $f2)
        SHIFT2_LINE(144, $f6, $f8, $f2, $f4)
        SHIFT2_LINE(168, $f8, $f2, $f4, $f6)
        PTR_SUBU   "%[src], %[src], %[stride2]      \n\t"
        PTR_ADDIU  "%[dst], %[dst], 0x08            \n\t"
        "addiu      $8,     $8,    -0x01            \n\t"
        "bnez       $8,     1b                      \n\t"
        : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT
          [src]"+r"(src), [dst]"+r"(dst)
        : [stride]"r"(stride), [stride1]"r"(-2*stride),
          [shift]"f"(shift), [rnd]"m"(rnd),
          [stride2]"r"(9*stride-4), [ff_pw_9]"m"(ff_pw_9)
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12",
          "$f14", "$f16", "memory"
    );
}
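
/*
 * For reference, what one SHIFT2_LINE output sample works out to, written
 * as plain C: the (-1, 9, 9, -1) half-pel filter over four source rows,
 * using the broadcast rounder and arithmetic shift from above. An
 * illustrative sketch derived from the code, not compiled:
 */
#if 0
static int16_t ver_shift2_sample(const uint8_t *p, int stride,
                                 int rnd, int shift)
{
    return (-p[-stride] + 9 * p[0] + 9 * p[stride] - p[2 * stride] +
            rnd) >> shift;
}
#endif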

/**
 * Data is already unpacked, so some operations can be performed straight
 * from memory.
 */
#define VC1_HOR_16B_SHIFT2(OP, OPNAME)                                  \
static void OPNAME ## vc1_hor_16b_shift2_mmi(uint8_t *dst, mips_reg stride, \
                                             const int16_t *src, int rnd) \
{                                                                       \
    int h = 8;                                                          \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
    src -= 1;                                                           \
    rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */                        \
                                                                        \
    __asm__ volatile(                                                   \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f12,   %[ff_pw_128]            \n\t"               \
        "ldc1       $f10,   %[ff_pw_9]              \n\t"               \
        "1:                                         \n\t"               \
        MMI_ULDC1($f2, %[src], 0x00)                                    \
        MMI_ULDC1($f4, %[src], 0x08)                                    \
        MMI_ULDC1($f6, %[src], 0x02)                                    \
        MMI_ULDC1($f8, %[src], 0x0a)                                    \
        MMI_ULDC1($f0, %[src], 0x06)                                    \
        "paddh      $f2,    $f2,    $f0             \n\t"               \
        MMI_ULDC1($f0, %[src], 0x0e)                                    \
        "paddh      $f4,    $f4,    $f0             \n\t"               \
        MMI_ULDC1($f0, %[src], 0x04)                                    \
        "paddh      $f6,    $f6,    $f0             \n\t"               \
        MMI_ULDC1($f0, %[src], 0x0c)                                    \
        "paddh      $f8,    $f8,    $f0             \n\t"               \
        "pmullh     $f6,    $f6,    $f10            \n\t"               \
        "pmullh     $f8,    $f8,    $f10            \n\t"               \
        "psubh      $f6,    $f6,    $f2             \n\t"               \
        "psubh      $f8,    $f8,    $f4             \n\t"               \
        "li         $8,     0x07                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        NORMALIZE_MMI("$f16")                                           \
        /* Remove bias */                                               \
        "paddh      $f6,    $f6,    $f12            \n\t"               \
        "paddh      $f8,    $f8,    $f12            \n\t"               \
        TRANSFER_DO_PACK(OP)                                            \
        "addiu      %[h],   %[h],  -0x01            \n\t"               \
        PTR_ADDIU  "%[src], %[src], 0x18            \n\t"               \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"               \
        "bnez       %[h],   1b                      \n\t"               \
        : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT                         \
          [h]"+r"(h),                                                   \
          [src]"+r"(src), [dst]"+r"(dst)                                \
        : [stride]"r"(stride), [rnd]"m"(rnd),                           \
          [ff_pw_9]"m"(ff_pw_9), [ff_pw_128]"m"(ff_pw_128)              \
        : "$8", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", "$f14", \
          "$f16", "memory"                                              \
    );                                                                  \
}

VC1_HOR_16B_SHIFT2(OP_PUT, put_)
VC1_HOR_16B_SHIFT2(OP_AVG, avg_)
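
/*
 * Note on the bias arithmetic above: the filter taps sum to
 * -1+9+9-1 = 16, so subtracting 16*1024 from the rounder keeps the
 * 16-bit accumulators safely below the signed overflow threshold;
 * after the >> 7 this amounts to -128, which is exactly what the
 * ff_pw_128 addition restores.
 */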

/**
 * Purely vertical or horizontal 1/2 shift interpolation.
 * Sacrifice $f12 for the *9 factor.
 */
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmi(uint8_t *dst, const uint8_t *src,  \
                                     mips_reg stride, int rnd,          \
                                     mips_reg offset)                   \
{                                                                       \
    DECLARE_VAR_LOW32;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
    rnd = 8 - rnd;                                                      \
                                                                        \
    __asm__ volatile(                                                   \
        "xor        $f0,    $f0,    $f0             \n\t"               \
        "li         $10,    0x08                    \n\t"               \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f12,   %[ff_pw_9]              \n\t"               \
        "1:                                         \n\t"               \
        MMI_ULWC1($f6, %[src], 0x00)                                    \
        MMI_ULWC1($f8, %[src], 0x04)                                    \
        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        PTR_ADDU   "%[src], %[src], %[offset]       \n\t"               \
        "punpcklbh  $f6,    $f6,    $f0             \n\t"               \
        "punpcklbh  $f8,    $f8,    $f0             \n\t"               \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "paddh      $f6,    $f6,    $f2             \n\t"               \
        "paddh      $f8,    $f8,    $f4             \n\t"               \
        PTR_ADDU   "$9,     %[src], %[offset_x2n]   \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        "pmullh     $f6,    $f6,    $f12            \n\t" /* 0,9,9,0 */ \
        "pmullh     $f8,    $f8,    $f12            \n\t" /* 0,9,9,0 */ \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "psubh      $f6,    $f6,    $f2             \n\t" /* -1,9,9,0 */ \
        "psubh      $f8,    $f8,    $f4             \n\t" /* -1,9,9,0 */ \
        PTR_ADDU   "$9,     %[src], %[offset]       \n\t"               \
        MMI_ULWC1($f2, $9, 0x00)                                        \
        MMI_ULWC1($f4, $9, 0x04)                                        \
        "punpcklbh  $f2,    $f2,    $f0             \n\t"               \
        "punpcklbh  $f4,    $f4,    $f0             \n\t"               \
        "psubh      $f6,    $f6,    $f2             \n\t" /* -1,9,9,-1 */ \
        "psubh      $f8,    $f8,    $f4             \n\t" /* -1,9,9,-1 */ \
        "li         $8,     0x04                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        NORMALIZE_MMI("$f16")                                           \
        "packushb   $f6,    $f6,    $f8             \n\t"               \
        OP((%[dst]), $f6)                                               \
        "sdc1       $f6,    0x00(%[dst])            \n\t"               \
        "addiu      $10,    $10,   -0x01            \n\t"               \
        PTR_ADDU   "%[src], %[src], %[stride1]      \n\t"               \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"               \
        "bnez       $10,    1b                      \n\t"               \
        : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT                         \
          [src]"+r"(src), [dst]"+r"(dst)                                \
        : [offset]"r"(offset), [offset_x2n]"r"(-2*offset),              \
          [stride]"r"(stride), [rnd]"m"(rnd),                           \
          [stride1]"r"(stride-offset),                                  \
          [ff_pw_9]"m"(ff_pw_9)                                         \
        : "$8", "$9", "$10", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", \
          "$f12", "$f14", "$f16", "memory"                              \
    );                                                                  \
}

VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)

/**
 * Core of the 1/4 and 3/4 shift bicubic interpolation.
 *
 * @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
 * @param LOAD "MMI_ULWC1" or "MMI_ULDC1"; use the latter if the data read
 *        is already unpacked.
 * @param M "1" for MMI_ULWC1, "2" for MMI_ULDC1.
 * @param A1 Stride address of 1st tap (beware of unpacked/packed).
 * @param A2 Stride address of 2nd tap
 * @param A3 Stride address of 3rd tap
 * @param A4 Stride address of 4th tap
 */
#define MSPEL_FILTER13_CORE(UNPACK, LOAD, M, A1, A2, A3, A4)            \
    PTR_ADDU   "$9,     %[src], "#A1"           \n\t"                   \
    LOAD($f2, $9, M*0)                                                  \
    LOAD($f4, $9, M*4)                                                  \
    UNPACK("$f2")                                                       \
    UNPACK("$f4")                                                       \
    "pmullh     $f2,    $f2,    %[ff_pw_3]      \n\t"                   \
    "pmullh     $f4,    $f4,    %[ff_pw_3]      \n\t"                   \
    PTR_ADDU   "$9,     %[src], "#A2"           \n\t"                   \
    LOAD($f6, $9, M*0)                                                  \
    LOAD($f8, $9, M*4)                                                  \
    UNPACK("$f6")                                                       \
    UNPACK("$f8")                                                       \
    "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */         \
    "pmullh     $f8,    $f8,    $f12            \n\t" /* *18 */         \
    "psubh      $f6,    $f6,    $f2             \n\t" /* *18, -3 */     \
    "psubh      $f8,    $f8,    $f4             \n\t" /* *18, -3 */     \
    PTR_ADDU   "$9,     %[src], "#A4"           \n\t"                   \
    LOAD($f2, $9, M*0)                                                  \
    LOAD($f4, $9, M*4)                                                  \
    UNPACK("$f2")                                                       \
    UNPACK("$f4")                                                       \
    "li         $8,     0x02                    \n\t"                   \
    "mtc1       $8,     $f16                    \n\t"                   \
    "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */          \
    "psllh      $f4,    $f4,    $f16            \n\t" /* 4* */          \
    "psubh      $f6,    $f6,    $f2             \n\t" /* -4,18,-3 */    \
    "psubh      $f8,    $f8,    $f4             \n\t" /* -4,18,-3 */    \
    PTR_ADDU   "$9,     %[src], "#A3"           \n\t"                   \
    LOAD($f2, $9, M*0)                                                  \
    LOAD($f4, $9, M*4)                                                  \
    UNPACK("$f2")                                                       \
    UNPACK("$f4")                                                       \
    "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */         \
    "pmullh     $f4,    $f4,    $f10            \n\t" /* *53 */         \
    "paddh      $f6,    $f6,    $f2             \n\t" /* 4,53,18,-3 */  \
    "paddh      $f8,    $f8,    $f4             \n\t" /* 4,53,18,-3 */

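/*
 * Scalar equivalent of MSPEL_FILTER13_CORE for the 1/4-pel (shift1)
 * case, derived from the tap comments above: taps (-4, 53, 18, -3)
 * applied to four samples spaced "offset" apart, with rounder 32-rnd
 * and >> 6 normalization in the 8-bit path. An illustrative sketch,
 * not compiled:
 */
#if 0
static int mspel_shift1_sample(const uint8_t *p, int offset, int rnd)
{
    return (-4 * p[-offset] + 53 * p[0] +
            18 * p[offset]  -  3 * p[2 * offset] + 32 - rnd) >> 6;
}
#endif
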
/**
 * Macro to build the vertical 16-bit version of vc1_put_shift[13].
 * Here, offset=src_stride. The parameters passed as A1 to A4 must use
 * %[stride_x1] (src_stride), %[stride_x2] (2*src_stride) and
 * %[stride_x3] (3*src_stride).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
static void                                                             \
vc1_put_ver_16b_ ## NAME ## _mmi(int16_t *dst, const uint8_t *src,      \
                                 mips_reg src_stride,                   \
                                 int rnd, int64_t shift)                \
{                                                                       \
    int h = 8;                                                          \
    DECLARE_VAR_LOW32;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
    src -= src_stride;                                                  \
                                                                        \
    __asm__ volatile(                                                   \
        "xor        $f0,    $f0,    $f0             \n\t"               \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f10,   %[ff_pw_53]             \n\t"               \
        "ldc1       $f12,   %[ff_pw_18]             \n\t"               \
        ".p2align 3                                 \n\t"               \
        "1:                                         \n\t"               \
        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)    \
        NORMALIZE_MMI("%[shift]")                                       \
        TRANSFER_DONT_PACK(OP_PUT)                                      \
        /* Last 3 (in fact 4) bytes on the line */                      \
        PTR_ADDU   "$9,     %[src], "#A1"           \n\t"               \
        MMI_ULWC1($f2, $9, 0x08)                                        \
        DO_UNPACK("$f2")                                                \
        "mov.d      $f6,    $f2                     \n\t"               \
        "paddh      $f2,    $f2,    $f2             \n\t"               \
        "paddh      $f2,    $f2,    $f6             \n\t" /* 3* */      \
        PTR_ADDU   "$9,     %[src], "#A2"           \n\t"               \
        MMI_ULWC1($f6, $9, 0x08)                                        \
        DO_UNPACK("$f6")                                                \
        "pmullh     $f6,    $f6,    $f12            \n\t" /* *18 */     \
        "psubh      $f6,    $f6,    $f2             \n\t" /* *18,-3 */  \
        PTR_ADDU   "$9,     %[src], "#A3"           \n\t"               \
        MMI_ULWC1($f2, $9, 0x08)                                        \
        DO_UNPACK("$f2")                                                \
        "pmullh     $f2,    $f2,    $f10            \n\t" /* *53 */     \
        "paddh      $f6,    $f6,    $f2             \n\t" /* *53,18,-3 */ \
        PTR_ADDU   "$9,     %[src], "#A4"           \n\t"               \
        MMI_ULWC1($f2, $9, 0x08)                                        \
        DO_UNPACK("$f2")                                                \
        "li         $8,     0x02                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        "psllh      $f2,    $f2,    $f16            \n\t" /* 4* */      \
        "psubh      $f6,    $f6,    $f2             \n\t"               \
        "paddh      $f6,    $f6,    $f14            \n\t"               \
        "li         $8,     0x06                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        "psrah      $f6,    $f6,    $f16            \n\t"               \
        "sdc1       $f6,    0x10(%[dst])            \n\t"               \
        "addiu      %[h],   %[h],  -0x01            \n\t"               \
        PTR_ADDU   "%[src], %[src], %[stride_x1]    \n\t"               \
        PTR_ADDIU  "%[dst], %[dst], 0x18            \n\t"               \
        "bnez       %[h],   1b                      \n\t"               \
        : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT                         \
          [h]"+r"(h),                                                   \
          [src]"+r"(src), [dst]"+r"(dst)                                \
        : [stride_x1]"r"(src_stride), [stride_x2]"r"(2*src_stride),     \
          [stride_x3]"r"(3*src_stride),                                 \
          [rnd]"m"(rnd), [shift]"f"(shift),                             \
          [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18),             \
          [ff_pw_3]"f"(ff_pw_3)                                         \
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
          "$f14", "$f16", "memory"                                      \
    );                                                                  \
}

/**
 * Macro to build the horizontal 16-bit version of vc1_put_shift[13].
 * Here the data are 16-bit, so A1 to A4 are plain constant byte offsets.
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)        \
static void                                                             \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmi(uint8_t *dst, mips_reg stride,   \
                                       const int16_t *src, int rnd)     \
{                                                                       \
    int h = 8;                                                          \
    DECLARE_VAR_ALL64;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
    src -= 1;                                                           \
    rnd -= (-4+58+13-3)*256; /* Add -256 bias */                        \
                                                                        \
    __asm__ volatile(                                                   \
        "xor        $f0,    $f0,    $f0             \n\t"               \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f10,   %[ff_pw_53]             \n\t"               \
        "ldc1       $f12,   %[ff_pw_18]             \n\t"               \
        ".p2align 3                                 \n\t"               \
        "1:                                         \n\t"               \
        MSPEL_FILTER13_CORE(DONT_UNPACK, MMI_ULDC1, 2, A1, A2, A3, A4)  \
        "li         $8,     0x07                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        NORMALIZE_MMI("$f16")                                           \
        /* Remove bias */                                               \
        "paddh      $f6,    $f6,    %[ff_pw_128]    \n\t"               \
        "paddh      $f8,    $f8,    %[ff_pw_128]    \n\t"               \
        TRANSFER_DO_PACK(OP)                                            \
        "addiu      %[h],   %[h],  -0x01            \n\t"               \
        PTR_ADDU   "%[src], %[src], 0x18            \n\t"               \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"               \
        "bnez       %[h],   1b                      \n\t"               \
        : RESTRICT_ASM_ALL64 RESTRICT_ASM_ADDRT                         \
          [h]"+r"(h),                                                   \
          [src]"+r"(src), [dst]"+r"(dst)                                \
        : [stride]"r"(stride), [rnd]"m"(rnd),                           \
          [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18),             \
          [ff_pw_3]"f"(ff_pw_3), [ff_pw_128]"f"(ff_pw_128)              \
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
          "$f14", "$f16", "memory"                                      \
    );                                                                  \
}

/**
 * Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
 * Here the offset is given at run time (1 or stride). The parameters
 * passed as A1 to A4 must use %[offset_x1] (offset), %[offset_x2]
 * (2*offset) and %[offset_x3] (3*offset).
 *
 * @param NAME Either 1 or 3
 * @see MSPEL_FILTER13_CORE for information on A1->A4
 */
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
static void                                                             \
OPNAME ## vc1_## NAME ## _mmi(uint8_t *dst, const uint8_t *src,         \
                              mips_reg stride, int rnd, mips_reg offset) \
{                                                                       \
    int h = 8;                                                          \
    DECLARE_VAR_LOW32;                                                  \
    DECLARE_VAR_ADDRT;                                                  \
                                                                        \
    src -= offset;                                                      \
    rnd  = 32-rnd;                                                      \
                                                                        \
    __asm__ volatile (                                                  \
        "xor        $f0,    $f0,    $f0             \n\t"               \
        LOAD_ROUNDER_MMI("%[rnd]")                                      \
        "ldc1       $f10,   %[ff_pw_53]             \n\t"               \
        "ldc1       $f12,   %[ff_pw_18]             \n\t"               \
        ".p2align 3                                 \n\t"               \
        "1:                                         \n\t"               \
        MSPEL_FILTER13_CORE(DO_UNPACK, MMI_ULWC1, 1, A1, A2, A3, A4)    \
        "li         $8,     0x06                    \n\t"               \
        "mtc1       $8,     $f16                    \n\t"               \
        NORMALIZE_MMI("$f16")                                           \
        TRANSFER_DO_PACK(OP)                                            \
        "addiu      %[h],   %[h],  -0x01            \n\t"               \
        PTR_ADDU   "%[src], %[src], %[stride]       \n\t"               \
        PTR_ADDU   "%[dst], %[dst], %[stride]       \n\t"               \
        "bnez       %[h],   1b                      \n\t"               \
        : RESTRICT_ASM_LOW32 RESTRICT_ASM_ADDRT                         \
          [h]"+r"(h),                                                   \
          [src]"+r"(src), [dst]"+r"(dst)                                \
        : [offset_x1]"r"(offset), [offset_x2]"r"(2*offset),             \
          [offset_x3]"r"(3*offset), [stride]"r"(stride),                \
          [rnd]"m"(rnd),                                                \
          [ff_pw_53]"m"(ff_pw_53), [ff_pw_18]"m"(ff_pw_18),             \
          [ff_pw_3]"f"(ff_pw_3)                                         \
        : "$8", "$9", "$f0", "$f2", "$f4", "$f6", "$f8", "$f10", "$f12", \
          "$f14", "$f16", "memory"                                      \
    );                                                                  \
}


/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_PUT, put_)
MSPEL_FILTER13_8B(shift1, %[offset_x3], %[offset_x2], %[offset_x1], $0, OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, %[stride_x3], %[stride_x2], %[stride_x1], $0)
MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, 6, 4, 2, 0, OP_AVG, avg_)

/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_PUT, put_)
MSPEL_FILTER13_8B(shift3, $0, %[offset_x1], %[offset_x2], %[offset_x3], OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, $0, %[stride_x1], %[stride_x2], %[stride_x3])
MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, 0, 2, 4, 6, OP_AVG, avg_)

typedef void (*vc1_mspel_mc_filter_ver_16bits)
(int16_t *dst, const uint8_t *src, mips_reg src_stride, int rnd,
 int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)
(uint8_t *dst, mips_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)
(uint8_t *dst, const uint8_t *src, mips_reg stride, int rnd,
 mips_reg offset);

/**
 * Interpolate fractional pel values by applying proper vertical then
 * horizontal filter.
 *
 * @param dst Destination buffer for interpolated pels.
 * @param src Source buffer.
 * @param stride Stride for both src and dst buffers.
 * @param hmode Horizontal filter (expressed in quarter pixels shift).
 * @param vmode Vertical filter.
 * @param rnd Rounding bias.
 */
#define VC1_MSPEL_MC(OP)                                                \
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
                               int hmode, int vmode, int rnd)           \
{                                                                       \
    static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
         { NULL, vc1_put_ver_16b_shift1_mmi,                            \
                 vc1_put_ver_16b_shift2_mmi,                            \
                 vc1_put_ver_16b_shift3_mmi };                          \
    static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
         { NULL, OP ## vc1_hor_16b_shift1_mmi,                          \
                 OP ## vc1_hor_16b_shift2_mmi,                          \
                 OP ## vc1_hor_16b_shift3_mmi };                        \
    static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =      \
         { NULL, OP ## vc1_shift1_mmi,                                  \
                 OP ## vc1_shift2_mmi,                                  \
                 OP ## vc1_shift3_mmi };                                \
                                                                        \
    if (vmode) { /* Vertical filter to apply */                         \
        if (hmode) { /* Horizontal filter to apply, output to tmp */    \
            static const int shift_value[] = { 0, 5, 1, 5 };            \
            int    shift = (shift_value[hmode]+shift_value[vmode])>>1;  \
            int    r;                                                   \
            LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);                    \
                                                                        \
            r = (1<<(shift-1)) + rnd-1;                                 \
            vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
                                                                        \
            vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
            return;                                                     \
        }                                                               \
        else { /* No horizontal filter, output 8 lines to dst */        \
            vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
            return;                                                     \
        }                                                               \
    }                                                                   \
                                                                        \
    /* Horizontal mode with no vertical mode */                         \
    vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);               \
}                                                                       \
static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src,     \
                                  int stride, int hmode, int vmode, int rnd)\
{                                                                       \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);    \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);    \
    dst += 8*stride; src += 8*stride;                                   \
    OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd);    \
    OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd);    \
}
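
/*
 * Dispatch example, following the code above: for mc12 (hmode = 1,
 * vmode = 2), shift = (shift_value[1] + shift_value[2]) >> 1 = 3 and
 * the call sequence reduces to the following sketch (illustrative
 * only, not compiled):
 */
#if 0
LOCAL_ALIGNED(16, int16_t, tmp, [12*8]);
int r = (1 << 2) + rnd - 1;
vc1_put_ver_16b_shift2_mmi(tmp, src - 1, stride, r, 3);     /* 1/2-pel vertical   */
put_vc1_hor_16b_shift1_mmi(dst, stride, tmp + 1, 64 - rnd); /* 1/4-pel horizontal */
#endif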

VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)

/** Macro to ease declaration of the bicubic interpolation functions */
#define DECLARE_FUNCTION(a, b)                                          \
void ff_put_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                \
                                           const uint8_t *src,          \
                                           ptrdiff_t stride,            \
                                           int rnd)                     \
{                                                                       \
    put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                      \
}                                                                       \
void ff_avg_vc1_mspel_mc ## a ## b ## _mmi(uint8_t *dst,                \
                                           const uint8_t *src,          \
                                           ptrdiff_t stride,            \
                                           int rnd)                     \
{                                                                       \
    avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                      \
}                                                                       \
void ff_put_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,             \
                                              const uint8_t *src,       \
                                              ptrdiff_t stride,         \
                                              int rnd)                  \
{                                                                       \
    put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                   \
}                                                                       \
void ff_avg_vc1_mspel_mc ## a ## b ## _16_mmi(uint8_t *dst,             \
                                              const uint8_t *src,       \
                                              ptrdiff_t stride,         \
                                              int rnd)                  \
{                                                                       \
    avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                   \
}

DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)

DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)

DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)

DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)

#define CHROMA_MC_8_MMI                                                 \
    "punpckhbh  %[ftmp5],   %[ftmp1],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]    \n\t"               \
    "punpckhbh  %[ftmp6],   %[ftmp2],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]    \n\t"               \
    "punpckhbh  %[ftmp7],   %[ftmp3],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]    \n\t"               \
    "punpckhbh  %[ftmp8],   %[ftmp4],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]    \n\t"               \
                                                                        \
    "pmullh     %[ftmp1],   %[ftmp1],   %[A]        \n\t"               \
    "pmullh     %[ftmp5],   %[ftmp5],   %[A]        \n\t"               \
    "pmullh     %[ftmp2],   %[ftmp2],   %[B]        \n\t"               \
    "pmullh     %[ftmp6],   %[ftmp6],   %[B]        \n\t"               \
    "pmullh     %[ftmp3],   %[ftmp3],   %[C]        \n\t"               \
    "pmullh     %[ftmp7],   %[ftmp7],   %[C]        \n\t"               \
    "pmullh     %[ftmp4],   %[ftmp4],   %[D]        \n\t"               \
    "pmullh     %[ftmp8],   %[ftmp8],   %[D]        \n\t"               \
                                                                        \
    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]    \n\t"               \
    "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]    \n\t"               \
    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]    \n\t"               \
    "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28] \n\t"               \
                                                                        \
    "paddh      %[ftmp5],   %[ftmp5],   %[ftmp6]    \n\t"               \
    "paddh      %[ftmp7],   %[ftmp7],   %[ftmp8]    \n\t"               \
    "paddh      %[ftmp5],   %[ftmp5],   %[ftmp7]    \n\t"               \
    "paddh      %[ftmp5],   %[ftmp5],   %[ff_pw_28] \n\t"               \
                                                                        \
    "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp9]    \n\t"               \
    "psrlh      %[ftmp5],   %[ftmp5],   %[ftmp9]    \n\t"               \
    "packushb   %[ftmp1],   %[ftmp1],   %[ftmp5]    \n\t"


#define CHROMA_MC_4_MMI                                                 \
    "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp3],   %[ftmp3],   %[ftmp0]    \n\t"               \
    "punpcklbh  %[ftmp4],   %[ftmp4],   %[ftmp0]    \n\t"               \
                                                                        \
    "pmullh     %[ftmp1],   %[ftmp1],   %[A]        \n\t"               \
    "pmullh     %[ftmp2],   %[ftmp2],   %[B]        \n\t"               \
    "pmullh     %[ftmp3],   %[ftmp3],   %[C]        \n\t"               \
    "pmullh     %[ftmp4],   %[ftmp4],   %[D]        \n\t"               \
                                                                        \
    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]    \n\t"               \
    "paddh      %[ftmp3],   %[ftmp3],   %[ftmp4]    \n\t"               \
    "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]    \n\t"               \
    "paddh      %[ftmp1],   %[ftmp1],   %[ff_pw_28] \n\t"               \
                                                                        \
    "psrlh      %[ftmp1],   %[ftmp1],   %[ftmp5]    \n\t"               \
    "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]    \n\t"

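/*
 * The chroma MC below is plain 2x2 bilinear interpolation: with
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y and D = xy the weights sum to
 * 64, and these "no_rnd" variants use 28 rather than 32 as the rounding
 * constant before the >> 6. A scalar sketch of one 4-wide row,
 * illustrative only and not compiled:
 */
#if 0
static void chroma_mc4_ref_row(uint8_t *dst, const uint8_t *src,
                               int stride, int A, int B, int C, int D)
{
    int i;
    for (i = 0; i < 4; i++)
        dst[i] = (A * src[i]          + B * src[i + 1] +
                  C * src[i + stride] + D * src[i + stride + 1] + 28) >> 6;
}
#endif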

void ff_put_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    double ftmp[10];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    __asm__ volatile(
        "li         %[tmp0],    0x06                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                \n\t"
        "pshufh     %[A],       %[A],       %[ftmp0]    \n\t"
        "pshufh     %[B],       %[B],       %[ftmp0]    \n\t"
        "pshufh     %[C],       %[C],       %[ftmp0]    \n\t"
        "pshufh     %[D],       %[D],       %[ftmp0]    \n\t"

        "1:                                             \n\t"
        MMI_ULDC1(%[ftmp1], %[src], 0x00)
        MMI_ULDC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU   "%[src],     %[src],     %[stride]   \n\t"
        MMI_ULDC1(%[ftmp3], %[src], 0x00)
        MMI_ULDC1(%[ftmp4], %[src], 0x01)

        CHROMA_MC_8_MMI

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        "addiu      %[h],       %[h],      -0x01        \n\t"
        PTR_ADDU   "%[dst],     %[dst],     %[stride]   \n\t"
        "bnez       %[h],       1b                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [tmp0]"=&r"(tmp[0]),
          [src]"+&r"(src), [dst]"+&r"(dst),
          [h]"+&r"(h)
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
        : "memory"
    );
}

void ff_put_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    double ftmp[6];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    __asm__ volatile(
        "li         %[tmp0],    0x06                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]    \n\t"
        "mtc1       %[tmp0],    %[ftmp5]                \n\t"
        "pshufh     %[A],       %[A],       %[ftmp0]    \n\t"
        "pshufh     %[B],       %[B],       %[ftmp0]    \n\t"
        "pshufh     %[C],       %[C],       %[ftmp0]    \n\t"
        "pshufh     %[D],       %[D],       %[ftmp0]    \n\t"

        "1:                                             \n\t"
        MMI_ULWC1(%[ftmp1], %[src], 0x00)
        MMI_ULWC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU   "%[src],     %[src],     %[stride]   \n\t"
        MMI_ULWC1(%[ftmp3], %[src], 0x00)
        MMI_ULWC1(%[ftmp4], %[src], 0x01)

        CHROMA_MC_4_MMI

        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        "addiu      %[h],       %[h],      -0x01        \n\t"
        PTR_ADDU   "%[dst],     %[dst],     %[stride]   \n\t"
        "bnez       %[h],       1b                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [src]"+&r"(src), [dst]"+&r"(dst),
          [h]"+&r"(h)
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
        : "memory"
    );
}

void ff_avg_no_rnd_vc1_chroma_mc8_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = (x) * (8 - y);
    const int C = (8 - x) * (y);
    const int D = (x) * (y);
    double ftmp[10];
    uint32_t tmp[1];
    DECLARE_VAR_ALL64;
    DECLARE_VAR_ADDRT;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    __asm__ volatile(
        "li         %[tmp0],    0x06                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]    \n\t"
        "mtc1       %[tmp0],    %[ftmp9]                \n\t"
        "pshufh     %[A],       %[A],       %[ftmp0]    \n\t"
        "pshufh     %[B],       %[B],       %[ftmp0]    \n\t"
        "pshufh     %[C],       %[C],       %[ftmp0]    \n\t"
        "pshufh     %[D],       %[D],       %[ftmp0]    \n\t"

        "1:                                             \n\t"
        MMI_ULDC1(%[ftmp1], %[src], 0x00)
        MMI_ULDC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU   "%[src],     %[src],     %[stride]   \n\t"
        MMI_ULDC1(%[ftmp3], %[src], 0x00)
        MMI_ULDC1(%[ftmp4], %[src], 0x01)

        CHROMA_MC_8_MMI

        MMI_LDC1(%[ftmp2], %[dst], 0x00)
        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]    \n\t"

        MMI_SDC1(%[ftmp1], %[dst], 0x00)
        "addiu      %[h],       %[h],      -0x01        \n\t"
        PTR_ADDU   "%[dst],     %[dst],     %[stride]   \n\t"
        "bnez       %[h],       1b                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
          [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_ALL64
          RESTRICT_ASM_ADDRT
          [src]"+&r"(src), [dst]"+&r"(dst),
          [h]"+&r"(h)
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
        : "memory"
    );
}

void ff_avg_no_rnd_vc1_chroma_mc4_mmi(uint8_t *dst /* align 8 */,
                                      uint8_t *src /* align 1 */,
                                      int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = ( x) * (8 - y);
    const int C = (8 - x) * ( y);
    const int D = ( x) * ( y);
    double ftmp[6];
    uint32_t tmp[1];
    DECLARE_VAR_LOW32;
    DECLARE_VAR_ADDRT;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    __asm__ volatile(
        "li         %[tmp0],    0x06                    \n\t"
        "xor        %[ftmp0],   %[ftmp0],   %[ftmp0]    \n\t"
        "mtc1       %[tmp0],    %[ftmp5]                \n\t"
        "pshufh     %[A],       %[A],       %[ftmp0]    \n\t"
        "pshufh     %[B],       %[B],       %[ftmp0]    \n\t"
        "pshufh     %[C],       %[C],       %[ftmp0]    \n\t"
        "pshufh     %[D],       %[D],       %[ftmp0]    \n\t"

        "1:                                             \n\t"
        MMI_ULWC1(%[ftmp1], %[src], 0x00)
        MMI_ULWC1(%[ftmp2], %[src], 0x01)
        PTR_ADDU   "%[src],     %[src],     %[stride]   \n\t"
        MMI_ULWC1(%[ftmp3], %[src], 0x00)
        MMI_ULWC1(%[ftmp4], %[src], 0x01)

        CHROMA_MC_4_MMI

        MMI_LWC1(%[ftmp2], %[dst], 0x00)
        "pavgb      %[ftmp1],   %[ftmp1],   %[ftmp2]    \n\t"

        MMI_SWC1(%[ftmp1], %[dst], 0x00)
        "addiu      %[h],       %[h],      -0x01        \n\t"
        PTR_ADDU   "%[dst],     %[dst],     %[stride]   \n\t"
        "bnez       %[h],       1b                      \n\t"
        : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
          [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
          [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
          [tmp0]"=&r"(tmp[0]),
          RESTRICT_ASM_LOW32
          RESTRICT_ASM_ADDRT
          [src]"+&r"(src), [dst]"+&r"(dst),
          [h]"+&r"(h)
        : [stride]"r"((mips_reg)stride),
          [A]"f"(A), [B]"f"(B),
          [C]"f"(C), [D]"f"(D),
          [ff_pw_28]"f"(ff_pw_28)
        : "memory"
    );
}