1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vp8/common/filter.h"
12 #include "vpx_ports/asmdefs_mmi.h"
13
/* VP8 six-tap subpel filter coefficients for the 8 fractional offsets
 * (index = xoffset or yoffset).  Each of the 6 taps is replicated 8 times
 * (8 x int16) so the asm below can load a tap vector with a single ldc1 at
 * byte offsets 0x00, 0x10, ..., 0x50.  The values mirror the reference VP8
 * six-tap coefficients (see vp8/common/filter.h / filter.c); results are
 * rounded with +64 and shifted right by 7 (sum of taps is 128). */
DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
  /* offset 0: { 0, 0, 128, 0, 0, 0 } -- identity (pure copy) */
  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
  /* offset 1: { 0, -6, 123, 12, -1, 0 } */
  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
  /* offset 2: { 2, -11, 108, 36, -8, 1 } */
  { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
    0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
  /* offset 3: { 0, -9, 93, 50, -6, 0 } */
  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
  /* offset 4: { 3, -16, 77, 77, -16, 3 } -- symmetric half-pel filter */
  { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
    0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
    0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
    0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
  /* offset 5: { 0, -6, 50, 93, -9, 0 } -- mirror of offset 3 */
  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
    0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
    0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
    0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
  /* offset 6: { 1, -8, 36, 108, -11, 2 } -- mirror of offset 2 */
  { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
    0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
    0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
    0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
    0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
    0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
  /* offset 7: { 0, -1, 12, 123, -6, 0 } -- mirror of offset 1 */
  { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
    0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
    0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
};
64
65 /* Horizontal filter: pixel_step is 1, output_height and output_width are
66 the size of horizontal filtering output, output_height is always H + 5 */
vp8_filter_block1d_h6_mmi(unsigned char * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int output_height,unsigned int output_width,const int16_t * vp8_filter)67 static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
68 uint16_t *output_ptr,
69 unsigned int src_pixels_per_line,
70 unsigned int output_height,
71 unsigned int output_width,
72 const int16_t *vp8_filter) {
73 uint32_t tmp[1];
74 DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
75
76 #if _MIPS_SIM == _ABIO32
77 register double fzero asm("$f0");
78 register double ftmp0 asm("$f2");
79 register double ftmp1 asm("$f4");
80 register double ftmp2 asm("$f6");
81 register double ftmp3 asm("$f8");
82 register double ftmp4 asm("$f10");
83 register double ftmp5 asm("$f12");
84 register double ftmp6 asm("$f14");
85 register double ftmp7 asm("$f16");
86 register double ftmp8 asm("$f18");
87 register double ftmp9 asm("$f20");
88 register double ftmp10 asm("$f22");
89 register double ftmp11 asm("$f24");
90 #else
91 register double fzero asm("$f0");
92 register double ftmp0 asm("$f1");
93 register double ftmp1 asm("$f2");
94 register double ftmp2 asm("$f3");
95 register double ftmp3 asm("$f4");
96 register double ftmp4 asm("$f5");
97 register double ftmp5 asm("$f6");
98 register double ftmp6 asm("$f7");
99 register double ftmp7 asm("$f8");
100 register double ftmp8 asm("$f9");
101 register double ftmp9 asm("$f10");
102 register double ftmp10 asm("$f11");
103 register double ftmp11 asm("$f12");
104 #endif // _MIPS_SIM == _ABIO32
105
106 __asm__ volatile (
107 "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
108 "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
109 "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
110 "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
111 "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
112 "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
113 "xor %[fzero], %[fzero], %[fzero] \n\t"
114 "li %[tmp0], 0x07 \n\t"
115 "mtc1 %[tmp0], %[ftmp7] \n\t"
116 "li %[tmp0], 0x08 \n\t"
117 "mtc1 %[tmp0], %[ftmp11] \n\t"
118
119 "1: \n\t"
120 "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t"
121 "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t"
122 "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t"
123 "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t"
124
125 "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
126 "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t"
127
128 "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t"
129 "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t"
130 "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
131
132 "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
133 "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t"
134 "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
135
136 "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
137 "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t"
138 "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
139
140 "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
141 "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
142 "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
143 "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
144
145 "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
146 "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t"
147 "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t"
148 "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t"
149
150 "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t"
151 "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t"
152 "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t"
153 "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t"
154 "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t"
155 "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t"
156
157 "addiu %[output_height], %[output_height], -0x01 \n\t"
158 MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
159 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
160 "bnez %[output_height], 1b \n\t"
161 : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
162 [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
163 [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
164 [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
165 [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
166 [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
167 [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]),
168 [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height),
169 [src_ptr]"+&r"(src_ptr)
170 : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
171 [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width),
172 [ff_ph_40]"f"(ff_ph_40)
173 : "memory"
174 );
175 }
176
177 /* Horizontal filter: pixel_step is always W */
vp8_filter_block1dc_v6_mmi(uint16_t * src_ptr,unsigned char * output_ptr,unsigned int output_height,int output_pitch,unsigned int pixels_per_line,const int16_t * vp8_filter)178 static INLINE void vp8_filter_block1dc_v6_mmi(
179 uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
180 int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
181 DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
182 uint32_t tmp[1];
183 mips_reg addr[1];
184 #if _MIPS_SIM == _ABIO32
185 register double fzero asm("$f0");
186 register double ftmp0 asm("$f2");
187 register double ftmp1 asm("$f4");
188 register double ftmp2 asm("$f6");
189 register double ftmp3 asm("$f8");
190 register double ftmp4 asm("$f10");
191 register double ftmp5 asm("$f12");
192 register double ftmp6 asm("$f14");
193 register double ftmp7 asm("$f16");
194 register double ftmp8 asm("$f18");
195 register double ftmp9 asm("$f20");
196 register double ftmp10 asm("$f22");
197 register double ftmp11 asm("$f24");
198 register double ftmp12 asm("$f26");
199 register double ftmp13 asm("$f28");
200 #else
201 register double fzero asm("$f0");
202 register double ftmp0 asm("$f1");
203 register double ftmp1 asm("$f2");
204 register double ftmp2 asm("$f3");
205 register double ftmp3 asm("$f4");
206 register double ftmp4 asm("$f5");
207 register double ftmp5 asm("$f6");
208 register double ftmp6 asm("$f7");
209 register double ftmp7 asm("$f8");
210 register double ftmp8 asm("$f9");
211 register double ftmp9 asm("$f10");
212 register double ftmp10 asm("$f11");
213 register double ftmp11 asm("$f12");
214 register double ftmp12 asm("$f13");
215 register double ftmp13 asm("$f14");
216 #endif // _MIPS_SIM == _ABIO32
217
218 __asm__ volatile (
219 "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t"
220 "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t"
221 "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t"
222 "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t"
223 "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t"
224 "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t"
225 "xor %[fzero], %[fzero], %[fzero] \n\t"
226 "li %[tmp0], 0x07 \n\t"
227 "mtc1 %[tmp0], %[ftmp13] \n\t"
228
229 /* In order to make full use of memory load delay slot,
230 * Operation of memory loading and calculating has been rearranged.
231 */
232 "1: \n\t"
233 "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t"
234 "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t"
235 MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line])
236 "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t"
237 "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t"
238 MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
239 "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t"
240 "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t"
241
242 MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
243 "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t"
244 "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t"
245 MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
246 MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2])
247 "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t"
248 "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t"
249 MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4])
250 "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t"
251 "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t"
252
253 "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t"
254
255 "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t"
256 "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t"
257
258 "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t"
259 "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t"
260
261 "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t"
262 "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t"
263
264 "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t"
265 "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t"
266
267 "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t"
268 "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t"
269
270 "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t"
271 "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t"
272 "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t"
273 "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t"
274 "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t"
275
276 MMI_ADDIU(%[output_height], %[output_height], -0x01)
277 MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
278 "bnez %[output_height], 1b \n\t"
279 : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
280 [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2),
281 [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4),
282 [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6),
283 [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8),
284 [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10),
285 [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12),
286 [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]),
287 [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr),
288 [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
289 : [pixels_per_line]"r"((mips_reg)pixels_per_line),
290 [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
291 [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
292 [vp8_filter]"r"(vp8_filter),
293 [output_pitch]"r"((mips_reg)output_pitch),
294 [ff_ph_40]"f"(ff_ph_40)
295 : "memory"
296 );
297 }
298
299 /* When xoffset == 0, vp8_filter= {0,0,128,0,0,0},
300 function vp8_filter_block1d_h6_mmi and vp8_filter_block1d_v6_mmi can
301 be simplified */
vp8_filter_block1d_h6_filter0_mmi(unsigned char * src_ptr,uint16_t * output_ptr,unsigned int src_pixels_per_line,unsigned int output_height,unsigned int output_width)302 static INLINE void vp8_filter_block1d_h6_filter0_mmi(
303 unsigned char *src_ptr, uint16_t *output_ptr,
304 unsigned int src_pixels_per_line, unsigned int output_height,
305 unsigned int output_width) {
306 #if _MIPS_SIM == _ABIO32
307 register double fzero asm("$f0");
308 register double ftmp0 asm("$f2");
309 register double ftmp1 asm("$f4");
310 #else
311 register double fzero asm("$f0");
312 register double ftmp0 asm("$f1");
313 register double ftmp1 asm("$f2");
314 #endif // _MIPS_SIM == _ABIO32
315
316 __asm__ volatile (
317 "xor %[fzero], %[fzero], %[fzero] \n\t"
318
319 "1: \n\t"
320 "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
321 "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
322 MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line])
323
324 "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t"
325 "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t"
326 "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
327
328 "addiu %[output_height], %[output_height], -0x01 \n\t"
329 MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width])
330 "bnez %[output_height], 1b \n\t"
331 : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
332 [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
333 [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
334 : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
335 [output_width]"r"(output_width)
336 : "memory"
337 );
338 }
339
vp8_filter_block1dc_v6_filter0_mmi(uint16_t * src_ptr,unsigned char * output_ptr,unsigned int output_height,int output_pitch,unsigned int pixels_per_line)340 static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
341 uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
342 int output_pitch, unsigned int pixels_per_line) {
343 #if _MIPS_SIM == _ABIO32
344 register double fzero asm("$f0");
345 register double ftmp0 asm("$f2");
346 register double ftmp1 asm("$f4");
347 #else
348 register double fzero asm("$f0");
349 register double ftmp0 asm("$f1");
350 register double ftmp1 asm("$f2");
351 #endif // _MIPS_SIM == _ABIO32
352
353 __asm__ volatile (
354 "xor %[fzero], %[fzero], %[fzero] \n\t"
355
356 "1: \n\t"
357 "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t"
358 "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t"
359 MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line])
360 MMI_ADDIU(%[output_height], %[output_height], -0x01)
361 "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t"
362 "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t"
363 "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t"
364
365 MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
366 "bnez %[output_height], 1b \n\t"
367 : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0),
368 [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr),
369 [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height)
370 : [pixels_per_line]"r"((mips_reg)pixels_per_line),
371 [output_pitch]"r"((mips_reg)output_pitch)
372 : "memory"
373 );
374 }
375
/* Emits vp8_sixtap_predict<n>x<m>_mmi: the public two-pass 6-tap subpel
 * predictor.
 *
 * Pass 1 (horizontal) filters an (m + 5)-row band starting two rows above
 * src_ptr into FData2 (uint16_t; row stride is n * 2 bytes = n elements),
 * working in 4-pixel-wide columns (loop = n / 4).  Pass 2 (vertical)
 * filters FData2 down to the m output rows.
 *
 * When an offset is 0 the tap set is the identity filter, so the filter0
 * copy routines are used instead.  For yoffset == 0, FData2 + n * 2 skips
 * the two extra rows at the top of the band that only the vertical taps
 * would have needed.  FData2 is conservatively sized from n alone, which
 * covers every (n, m) pair instantiated below. */
#define sixtapNxM(n, m)                                                       \
  void vp8_sixtap_predict##n##x##m##_mmi(                                     \
      unsigned char *src_ptr, int src_pixels_per_line, int xoffset,           \
      int yoffset, unsigned char *dst_ptr, int dst_pitch) {                   \
    DECLARE_ALIGNED(16, uint16_t,                                             \
                    FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]);    \
    const int16_t *HFilter, *VFilter;                                         \
    int i, loop = n / 4;                                                      \
    HFilter = vp8_six_tap_mmi[xoffset];                                       \
    VFilter = vp8_six_tap_mmi[yoffset];                                       \
                                                                              \
    if (xoffset == 0) {                                                       \
      for (i = 0; i < loop; ++i) {                                            \
        vp8_filter_block1d_h6_filter0_mmi(                                    \
            src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4,      \
            src_pixels_per_line, m + 5, n * 2);                               \
      }                                                                       \
    } else {                                                                  \
      for (i = 0; i < loop; ++i) {                                            \
        vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \
                                  FData2 + i * 4, src_pixels_per_line, m + 5, \
                                  n * 2, HFilter);                            \
      }                                                                       \
    }                                                                         \
    if (yoffset == 0) {                                                       \
      for (i = 0; i < loop; ++i) {                                            \
        vp8_filter_block1dc_v6_filter0_mmi(                                   \
            FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2);    \
      }                                                                       \
    } else {                                                                  \
      for (i = 0; i < loop; ++i) {                                            \
        vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m,        \
                                   dst_pitch, n * 2, VFilter);                \
      }                                                                       \
    }                                                                         \
  }

/* Instantiate the block sizes the VP8 decoder/encoder uses. */
sixtapNxM(4, 4);
sixtapNxM(8, 8);
sixtapNxM(8, 4);
sixtapNxM(16, 16);
417