1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "vp8/common/filter.h"
12 #include "vpx_ports/asmdefs_mmi.h"
13 
14 DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = {
15   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
16     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
17     0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080,
18     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
19     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
20     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
21   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
22     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
23     0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
24     0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
25     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
26     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
27   { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002,
28     0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
29     0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
30     0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
31     0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
32     0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 },
33   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
34     0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
35     0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
36     0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
37     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
38     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
39   { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003,
40     0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
41     0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
42     0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d,
43     0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0,
44     0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 },
45   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
46     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
47     0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032,
48     0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d,
49     0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7,
50     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 },
51   { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001,
52     0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8,
53     0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024,
54     0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c,
55     0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5,
56     0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 },
57   { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
58     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
59     0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c,
60     0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b,
61     0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa,
62     0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }
63 };
64 
/* Horizontal (first-pass) six-tap filter for a column of four pixels.
 *
 * For every row the nine bytes src_ptr[-2] .. src_ptr[6] are read, the six
 * taps are applied to produce four sums, 0x40 is added for rounding, the
 * sums are arithmetically shifted right by 7 and saturated to 0..255, and
 * the four results are stored as 16-bit values at output_ptr.
 *
 *   src_ptr              first of the four pixels being filtered.
 *   output_ptr           receives four uint16_t (8 bytes) per row.
 *   src_pixels_per_line  added to src_ptr after each row.
 *   output_height        number of rows; callers in this file pass H + 5.
 *                        The counter is only tested after a row has been
 *                        processed, so it must not be zero.
 *   output_width         added to the output_ptr register after each row,
 *                        i.e. a byte count (callers in this file pass n * 2).
 *   vp8_filter           one row of vp8_six_tap_mmi; tap k is read from byte
 *                        offset k * 0x10.
 */
static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr,
                                             uint16_t *output_ptr,
                                             unsigned int src_pixels_per_line,
                                             unsigned int output_height,
                                             unsigned int output_width,
                                             const int16_t *vp8_filter) {
  /* Scratch general-purpose register used to move constants into FP regs. */
  uint32_t tmp[1];
  /* Rounding constant: 0x40 in each of the four 16-bit lanes. */
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };

  /* Every FP operand is pinned to a fixed register. Under O32 only
   * even-numbered registers are used (presumably because a double occupies
   * an even/odd register pair there -- confirm against the ABI). */
#if _MIPS_SIM == _ABIO32
  register double fzero asm("$f0");
  register double ftmp0 asm("$f2");
  register double ftmp1 asm("$f4");
  register double ftmp2 asm("$f6");
  register double ftmp3 asm("$f8");
  register double ftmp4 asm("$f10");
  register double ftmp5 asm("$f12");
  register double ftmp6 asm("$f14");
  register double ftmp7 asm("$f16");
  register double ftmp8 asm("$f18");
  register double ftmp9 asm("$f20");
  register double ftmp10 asm("$f22");
  register double ftmp11 asm("$f24");
#else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
  register double ftmp1 asm("$f2");
  register double ftmp2 asm("$f3");
  register double ftmp3 asm("$f4");
  register double ftmp4 asm("$f5");
  register double ftmp5 asm("$f6");
  register double ftmp6 asm("$f7");
  register double ftmp7 asm("$f8");
  register double ftmp8 asm("$f9");
  register double ftmp9 asm("$f10");
  register double ftmp10 asm("$f11");
  register double ftmp11 asm("$f12");
#endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
    /* ftmp0..ftmp5 <- taps 0..5, four 16-bit copies of each. */
    "ldc1       %[ftmp0],       0x00(%[vp8_filter])                   \n\t"
    "ldc1       %[ftmp1],       0x10(%[vp8_filter])                   \n\t"
    "ldc1       %[ftmp2],       0x20(%[vp8_filter])                   \n\t"
    "ldc1       %[ftmp3],       0x30(%[vp8_filter])                   \n\t"
    "ldc1       %[ftmp4],       0x40(%[vp8_filter])                   \n\t"
    "ldc1       %[ftmp5],       0x50(%[vp8_filter])                   \n\t"
    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"
    /* ftmp7 = 7 (final right shift), ftmp11 = 8 (bit count used to drop one
     * byte from ftmp10 further down). */
    "li         %[tmp0],        0x07                                  \n\t"
    "mtc1       %[tmp0],        %[ftmp7]                              \n\t"
    "li         %[tmp0],        0x08                                  \n\t"
    "mtc1       %[tmp0],        %[ftmp11]                             \n\t"

    "1:                                                               \n\t"
    /* Unaligned 8-byte loads: ftmp9 <- src_ptr[-2..5],
     * ftmp10 <- src_ptr[-1..6]. */
    "gsldlc1    %[ftmp9],       0x05(%[src_ptr])                      \n\t"
    "gsldrc1    %[ftmp9],       -0x02(%[src_ptr])                     \n\t"
    "gsldlc1    %[ftmp10],      0x06(%[src_ptr])                      \n\t"
    "gsldrc1    %[ftmp10],      -0x01(%[src_ptr])                     \n\t"

    /* Low four bytes of ftmp9, widened to 16 bits, times tap 0; this starts
     * the accumulator ftmp8. */
    "punpcklbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
    "pmullh     %[ftmp8],       %[ftmp6],          %[ftmp0]           \n\t"

    /* High four bytes of ftmp9 times tap 4. */
    "punpckhbh  %[ftmp6],       %[ftmp9],          %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp4]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

    /* Low four bytes of ftmp10 times tap 1. */
    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp1]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

    /* High four bytes of ftmp10 times tap 5. */
    "punpckhbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp5]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

    /* Shift ftmp10 right by one byte; its low four bytes times tap 2. */
    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp2]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

    /* Shift by one more byte; low four bytes times tap 3. */
    "dsrl       %[ftmp10],      %[ftmp10],         %[ftmp11]          \n\t"
    "punpcklbh  %[ftmp6],       %[ftmp10],         %[fzero]           \n\t"
    "pmullh     %[ftmp6],       %[ftmp6],          %[ftmp3]           \n\t"
    "paddsh     %[ftmp8],       %[ftmp8],          %[ftmp6]           \n\t"

    /* Round, shift right by 7, saturate to unsigned bytes, widen back to
     * 16 bits and store the four results (8 bytes, unaligned store). */
    "paddsh     %[ftmp8],       %[ftmp8],          %[ff_ph_40]        \n\t"
    "psrah      %[ftmp8],       %[ftmp8],          %[ftmp7]           \n\t"
    "packushb   %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
    "punpcklbh  %[ftmp8],       %[ftmp8],          %[fzero]           \n\t"
    "gssdlc1    %[ftmp8],       0x07(%[output_ptr])                   \n\t"
    "gssdrc1    %[ftmp8],       0x00(%[output_ptr])                   \n\t"

    /* Next row: count down, step both pointers, loop while rows remain. */
    "addiu      %[output_height], %[output_height], -0x01             \n\t"
    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])
    "bnez       %[output_height],               1b                    \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
      [ftmp11]"=&f"(ftmp11),            [tmp0]"=&r"(tmp[0]),
      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height),
      [src_ptr]"+&r"(src_ptr)
    /* NOTE(review): output_width is passed without the (mips_reg) cast that
     * src_pixels_per_line gets -- confirm this is intentional. */
    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
      [vp8_filter]"r"(vp8_filter),      [output_width]"r"(output_width),
      [ff_ph_40]"f"(ff_ph_40)
    : "memory"
    );
}
176 
/* Vertical (second-pass) six-tap filter for a column of four pixels.
 *
 * For every output row, four 16-bit values are read from each of six
 * consecutive input rows (rows are pixels_per_line bytes apart), each row is
 * multiplied by its tap, the products are summed, 0x40 is added for
 * rounding, the sums are arithmetically shifted right by 7, saturated to
 * 0..255 and stored as four bytes at output_ptr.
 *
 *   src_ptr          first of the six input rows (16-bit samples).
 *   output_ptr       receives four bytes per row.
 *   output_height    number of output rows. The counter is only tested after
 *                    a row has been processed, so it must not be zero.
 *   output_pitch     added to output_ptr after each row.
 *   pixels_per_line  distance between input rows; it is added to the src_ptr
 *                    register directly, i.e. a byte count (callers in this
 *                    file pass n * 2).
 *   vp8_filter       one row of vp8_six_tap_mmi; tap k is read from byte
 *                    offset k * 0x10.
 */
static INLINE void vp8_filter_block1dc_v6_mmi(
    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
    int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) {
  /* Rounding constant: 0x40 in each of the four 16-bit lanes. */
  DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL };
  /* Scratch general-purpose registers: constant staging and row address. */
  uint32_t tmp[1];
  mips_reg addr[1];
  /* Every FP operand is pinned to a fixed register; O32 uses even-numbered
   * registers only. */
#if _MIPS_SIM == _ABIO32
  register double fzero asm("$f0");
  register double ftmp0 asm("$f2");
  register double ftmp1 asm("$f4");
  register double ftmp2 asm("$f6");
  register double ftmp3 asm("$f8");
  register double ftmp4 asm("$f10");
  register double ftmp5 asm("$f12");
  register double ftmp6 asm("$f14");
  register double ftmp7 asm("$f16");
  register double ftmp8 asm("$f18");
  register double ftmp9 asm("$f20");
  register double ftmp10 asm("$f22");
  register double ftmp11 asm("$f24");
  register double ftmp12 asm("$f26");
  register double ftmp13 asm("$f28");
#else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
  register double ftmp1 asm("$f2");
  register double ftmp2 asm("$f3");
  register double ftmp3 asm("$f4");
  register double ftmp4 asm("$f5");
  register double ftmp5 asm("$f6");
  register double ftmp6 asm("$f7");
  register double ftmp7 asm("$f8");
  register double ftmp8 asm("$f9");
  register double ftmp9 asm("$f10");
  register double ftmp10 asm("$f11");
  register double ftmp11 asm("$f12");
  register double ftmp12 asm("$f13");
  register double ftmp13 asm("$f14");
#endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
    /* ftmp0..ftmp5 <- taps 0..5, four 16-bit copies of each. */
    "ldc1       %[ftmp0],     0x00(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp1],     0x10(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp2],     0x20(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp3],     0x30(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp4],     0x40(%[vp8_filter])                     \n\t"
    "ldc1       %[ftmp5],     0x50(%[vp8_filter])                     \n\t"
    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"
    /* ftmp13 = 7: the final right shift. */
    "li         %[tmp0],      0x07                                    \n\t"
    "mtc1       %[tmp0],      %[ftmp13]                               \n\t"

    /* The loads and the arithmetic are deliberately separated and the rows
     * are fetched out of order so that load latency can be overlapped.
     * Load order: rows 0, 1, 2, 4, then src_ptr advances by one row and
     * rows 3 and 5 (relative to the old src_ptr) are fetched.
     */
    "1:                                                               \n\t"
    "gsldlc1    %[ftmp6],     0x07(%[src_ptr])                        \n\t"
    "gsldrc1    %[ftmp6],     0x00(%[src_ptr])                        \n\t"
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line])
    "gsldlc1    %[ftmp7],     0x07(%[addr0])                          \n\t"
    "gsldrc1    %[ftmp7],     0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
    "gsldlc1    %[ftmp8],     0x07(%[addr0])                          \n\t"
    "gsldrc1    %[ftmp8],     0x00(%[addr0])                          \n\t"

    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
    "gsldlc1    %[ftmp9],     0x07(%[addr0])                          \n\t"
    "gsldrc1    %[ftmp9],     0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x2])
    "gsldlc1    %[ftmp10],    0x07(%[addr0])                          \n\t"
    "gsldrc1    %[ftmp10],    0x00(%[addr0])                          \n\t"
    MMI_ADDU(%[addr0],     %[src_ptr],      %[pixels_per_line_x4])
    "gsldlc1    %[ftmp11],    0x07(%[addr0])                          \n\t"
    "gsldrc1    %[ftmp11],    0x00(%[addr0])                          \n\t"

    /* row 0 * tap 0 starts the accumulator ftmp12. */
    "pmullh     %[ftmp12],    %[ftmp6],        %[ftmp0]               \n\t"

    /* + row 1 * tap 1 */
    "pmullh     %[ftmp7],     %[ftmp7],        %[ftmp1]               \n\t"
    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp7]               \n\t"

    /* + row 2 * tap 2 */
    "pmullh     %[ftmp8],     %[ftmp8],        %[ftmp2]               \n\t"
    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp8]               \n\t"

    /* + row 4 * tap 4 (ftmp9 holds row 4, see the load order above) */
    "pmullh     %[ftmp9],     %[ftmp9],        %[ftmp4]               \n\t"
    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp9]               \n\t"

    /* + row 3 * tap 3 (ftmp10 holds row 3) */
    "pmullh     %[ftmp10],    %[ftmp10],       %[ftmp3]               \n\t"
    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp10]              \n\t"

    /* + row 5 * tap 5 */
    "pmullh     %[ftmp11],    %[ftmp11],       %[ftmp5]               \n\t"
    "paddsh     %[ftmp12],    %[ftmp12],       %[ftmp11]              \n\t"

    /* Round, shift right by 7, saturate to unsigned bytes and store the
     * four result bytes (unaligned 32-bit store). */
    "paddsh     %[ftmp12],    %[ftmp12],       %[ff_ph_40]            \n\t"
    "psrah      %[ftmp12],    %[ftmp12],       %[ftmp13]              \n\t"
    "packushb   %[ftmp12],    %[ftmp12],       %[fzero]               \n\t"
    "gsswlc1    %[ftmp12],    0x03(%[output_ptr])                     \n\t"
    "gsswrc1    %[ftmp12],    0x00(%[output_ptr])                     \n\t"

    /* Next row; src_ptr was already advanced between the loads above. */
    MMI_ADDIU(%[output_height], %[output_height], -0x01)
    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
    "bnez       %[output_height], 1b                                  \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
      [ftmp1]"=&f"(ftmp1),              [ftmp2]"=&f"(ftmp2),
      [ftmp3]"=&f"(ftmp3),              [ftmp4]"=&f"(ftmp4),
      [ftmp5]"=&f"(ftmp5),              [ftmp6]"=&f"(ftmp6),
      [ftmp7]"=&f"(ftmp7),              [ftmp8]"=&f"(ftmp8),
      [ftmp9]"=&f"(ftmp9),              [ftmp10]"=&f"(ftmp10),
      [ftmp11]"=&f"(ftmp11),            [ftmp12]"=&f"(ftmp12),
      [ftmp13]"=&f"(ftmp13),            [tmp0]"=&r"(tmp[0]),
      [addr0]"=&r"(addr[0]),            [src_ptr]"+&r"(src_ptr),
      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
      [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)),
      [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)),
      [vp8_filter]"r"(vp8_filter),
      [output_pitch]"r"((mips_reg)output_pitch),
      [ff_ph_40]"f"(ff_ph_40)
    : "memory"
    );
}
298 
/* Shortcut kernels for offset 0.
 *
 * When xoffset (or yoffset) is 0 the taps are {0, 0, 128, 0, 0, 0}, so
 * vp8_filter_block1d_h6_mmi and vp8_filter_block1dc_v6_mmi reduce to a plain
 * copy with a width conversion. The *_filter0_mmi variants implement that
 * directly and take no filter argument.
 *
 * Horizontal variant: for each of output_height rows, loads 8 bytes at
 * src_ptr, widens the low four to 16 bits and stores them (8 bytes) at
 * output_ptr.
 *
 *   src_pixels_per_line  added to src_ptr after each row.
 *   output_height        number of rows; tested only after a row has been
 *                        processed, so it must not be zero.
 *   output_width         added to the output_ptr register after each row,
 *                        i.e. a byte count (callers in this file pass n * 2).
 */
static INLINE void vp8_filter_block1d_h6_filter0_mmi(
    unsigned char *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int output_height,
    unsigned int output_width) {
  /* FP operands are pinned to fixed registers; O32 uses even-numbered
   * registers only. */
#if _MIPS_SIM == _ABIO32
  register double fzero asm("$f0");
  register double ftmp0 asm("$f2");
  register double ftmp1 asm("$f4");
#else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
  register double ftmp1 asm("$f2");
#endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
    "xor        %[fzero],       %[fzero],           %[fzero]          \n\t"

    "1:                                                               \n\t"
    /* Unaligned 8-byte load from src_ptr, then step to the next row. */
    "gsldlc1    %[ftmp0],       0x07(%[src_ptr])                      \n\t"
    "gsldrc1    %[ftmp0],       0x00(%[src_ptr])                      \n\t"
    MMI_ADDU(%[src_ptr],  %[src_ptr], %[src_pixels_per_line])

    /* Widen the low four bytes to 16 bits and store them (8 bytes). */
    "punpcklbh  %[ftmp1],       %[ftmp0],          %[fzero]           \n\t"
    "gssdlc1    %[ftmp1],       0x07(%[output_ptr])                   \n\t"
    "gssdrc1    %[ftmp1],       0x00(%[output_ptr])                   \n\t"

    "addiu      %[output_height], %[output_height], -0x01             \n\t"
    MMI_ADDU(%[output_ptr],  %[output_ptr],    %[output_width])
    "bnez       %[output_height],               1b                    \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
    : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line),
      [output_width]"r"(output_width)
    : "memory"
    );
}
339 
/* Vertical shortcut kernel for yoffset == 0 (taps {0, 0, 128, 0, 0, 0}).
 *
 * For each of output_height rows, loads four 16-bit values (8 bytes) at
 * src_ptr, narrows them to bytes with unsigned saturation and stores the
 * four bytes at output_ptr. No neighbouring rows are read; callers in this
 * file pass a src_ptr that is already advanced by two rows of the
 * intermediate buffer.
 *
 *   output_height    number of rows; tested only after a row has been
 *                    processed, so it must not be zero.
 *   output_pitch     added to output_ptr after each row.
 *   pixels_per_line  added to the src_ptr register after each row, i.e. a
 *                    byte count (callers in this file pass n * 2).
 */
static INLINE void vp8_filter_block1dc_v6_filter0_mmi(
    uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height,
    int output_pitch, unsigned int pixels_per_line) {
  /* FP operands are pinned to fixed registers; O32 uses even-numbered
   * registers only. */
#if _MIPS_SIM == _ABIO32
  register double fzero asm("$f0");
  register double ftmp0 asm("$f2");
  register double ftmp1 asm("$f4");
#else
  register double fzero asm("$f0");
  register double ftmp0 asm("$f1");
  register double ftmp1 asm("$f2");
#endif  // _MIPS_SIM == _ABIO32

  __asm__ volatile (
    "xor        %[fzero],     %[fzero],        %[fzero]               \n\t"

    "1:                                                               \n\t"
    /* Unaligned 8-byte load of four 16-bit samples; the pointer step and
     * the row countdown are placed between the load and its use. */
    "gsldlc1    %[ftmp0],     0x07(%[src_ptr])                        \n\t"
    "gsldrc1    %[ftmp0],     0x00(%[src_ptr])                        \n\t"
    MMI_ADDU(%[src_ptr],   %[src_ptr],      %[pixels_per_line])
    MMI_ADDIU(%[output_height], %[output_height], -0x01)
    /* Narrow to unsigned bytes and store four of them. */
    "packushb   %[ftmp1],     %[ftmp0],        %[fzero]               \n\t"
    "gsswlc1    %[ftmp1],     0x03(%[output_ptr])                     \n\t"
    "gsswrc1    %[ftmp1],     0x00(%[output_ptr])                     \n\t"

    MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch])
    "bnez       %[output_height], 1b                                  \n\t"
    : [fzero]"=&f"(fzero),              [ftmp0]"=&f"(ftmp0),
      [ftmp1]"=&f"(ftmp1),              [src_ptr]"+&r"(src_ptr),
      [output_ptr]"+&r"(output_ptr),    [output_height]"+&r"(output_height)
    : [pixels_per_line]"r"((mips_reg)pixels_per_line),
      [output_pitch]"r"((mips_reg)output_pitch)
    : "memory"
    );
}
375 
376 #define sixtapNxM(n, m)                                                        \
377   void vp8_sixtap_predict##n##x##m##_mmi(                                      \
378       unsigned char *src_ptr, int src_pixels_per_line, int xoffset,            \
379       int yoffset, unsigned char *dst_ptr, int dst_pitch) {                    \
380     DECLARE_ALIGNED(16, uint16_t,                                              \
381                     FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 16 : n))]);     \
382     const int16_t *HFilter, *VFilter;                                          \
383     int i, loop = n / 4;                                                       \
384     HFilter = vp8_six_tap_mmi[xoffset];                                        \
385     VFilter = vp8_six_tap_mmi[yoffset];                                        \
386                                                                                \
387     if (xoffset == 0) {                                                        \
388       for (i = 0; i < loop; ++i) {                                             \
389         vp8_filter_block1d_h6_filter0_mmi(                                     \
390             src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4,       \
391             src_pixels_per_line, m + 5, n * 2);                                \
392       }                                                                        \
393     } else {                                                                   \
394       for (i = 0; i < loop; ++i) {                                             \
395         vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \
396                                   FData2 + i * 4, src_pixels_per_line, m + 5,  \
397                                   n * 2, HFilter);                             \
398       }                                                                        \
399     }                                                                          \
400     if (yoffset == 0) {                                                        \
401       for (i = 0; i < loop; ++i) {                                             \
402         vp8_filter_block1dc_v6_filter0_mmi(                                    \
403             FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2);     \
404       }                                                                        \
405     } else {                                                                   \
406       for (i = 0; i < loop; ++i) {                                             \
407         vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m,         \
408                                    dst_pitch, n * 2, VFilter);                 \
409       }                                                                        \
410     }                                                                          \
411   }
412 
413 sixtapNxM(4, 4);
414 sixtapNxM(8, 8);
415 sixtapNxM(8, 4);
416 sixtapNxM(16, 16);
417