1 /*
2  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <string.h>
13 
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_dsp/vpx_filter.h"
19 #include "vpx_ports/asmdefs_mmi.h"
20 #include "vpx_ports/mem.h"
21 
/* Horizontal filter core for four output pixels.  Each register pair
 * {ftmp4/5, ftmp6/7, ftmp8/9, ftmp10/11} holds the eight 16-bit source
 * pixels for one output position; pmaddhw multiplies them by the two
 * halves of the 8-tap kernel (filter1 = taps 0-3, filter2 = taps 4-7),
 * and the punpckhwd + paddw pair folds the partial products down to one
 * 32-bit sum per pixel.  The four sums end up packed as word pairs in
 * srcl (pixels 0-1) and srch (pixels 2-3). */
#define GET_DATA_H_MMI                                     \
  "pmaddhw    %[ftmp4],    %[ftmp4],   %[filter1]    \n\t" \
  "pmaddhw    %[ftmp5],    %[ftmp5],   %[filter2]    \n\t" \
  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
  "punpckhwd  %[ftmp5],    %[ftmp4],   %[ftmp0]      \n\t" \
  "paddw      %[ftmp4],    %[ftmp4],   %[ftmp5]      \n\t" \
  "pmaddhw    %[ftmp6],    %[ftmp6],   %[filter1]    \n\t" \
  "pmaddhw    %[ftmp7],    %[ftmp7],   %[filter2]    \n\t" \
  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
  "punpckhwd  %[ftmp7],    %[ftmp6],   %[ftmp0]      \n\t" \
  "paddw      %[ftmp6],    %[ftmp6],   %[ftmp7]      \n\t" \
  "punpcklwd  %[srcl],     %[ftmp4],   %[ftmp6]      \n\t" \
  "pmaddhw    %[ftmp8],    %[ftmp8],   %[filter1]    \n\t" \
  "pmaddhw    %[ftmp9],    %[ftmp9],   %[filter2]    \n\t" \
  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
  "punpckhwd  %[ftmp9],    %[ftmp8],   %[ftmp0]      \n\t" \
  "paddw      %[ftmp8],    %[ftmp8],   %[ftmp9]      \n\t" \
  "pmaddhw    %[ftmp10],   %[ftmp10],  %[filter1]    \n\t" \
  "pmaddhw    %[ftmp11],   %[ftmp11],  %[filter2]    \n\t" \
  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
  "punpckhwd  %[ftmp11],   %[ftmp10],  %[ftmp0]      \n\t" \
  "paddw      %[ftmp10],   %[ftmp10],  %[ftmp11]     \n\t" \
  "punpcklwd  %[srch],     %[ftmp8],   %[ftmp10]     \n\t"
45 
/* Vertical filter core for four output pixels.  ftmp4..ftmp11 hold eight
 * successive source rows (four 16-bit pixels each).  Adjacent rows are
 * interleaved with punpcklhw/punpckhhw and multiplied by the matching
 * splatted tap pairs (filter10 = taps 0-1, filter32 = taps 2-3, etc.),
 * accumulating 32-bit sums into srcl (low two pixels) and srch (high two
 * pixels).  Clobbers ftmp12. */
#define GET_DATA_V_MMI                                     \
  "punpcklhw  %[srcl],     %[ftmp4],   %[ftmp5]      \n\t" \
  "pmaddhw    %[srcl],     %[srcl],    %[filter10]   \n\t" \
  "punpcklhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
  "punpcklhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
  "punpcklhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
  "paddw      %[srcl],     %[srcl],    %[ftmp12]     \n\t" \
  "punpckhhw  %[srch],     %[ftmp4],   %[ftmp5]      \n\t" \
  "pmaddhw    %[srch],     %[srch],    %[filter10]   \n\t" \
  "punpckhhw  %[ftmp12],   %[ftmp6],   %[ftmp7]      \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter32]   \n\t" \
  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
  "punpckhhw  %[ftmp12],   %[ftmp8],   %[ftmp9]      \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter54]   \n\t" \
  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t" \
  "punpckhhw  %[ftmp12],   %[ftmp10],  %[ftmp11]     \n\t" \
  "pmaddhw    %[ftmp12],   %[ftmp12],  %[filter76]   \n\t" \
  "paddw      %[srch],     %[srch],    %[ftmp12]     \n\t"
69 
/* clang-format off */
/* Vector ROUND_POWER_OF_TWO on srcl/srch: add the rounding constant
 * para[0] (callers set it to 1 << (FILTER_BITS - 1)) to every 32-bit
 * lane, then arithmetic-shift right by para[1] (callers set it to
 * FILTER_BITS).  Clobbers ftmp5, ftmp6 and tmp0. */
#define ROUND_POWER_OF_TWO_MMI                             \
  /* Add para[0] */                                        \
  "lw         %[tmp0],     0x00(%[para])             \n\t" \
  MMI_MTC1(%[tmp0],     %[ftmp6])                          \
  "punpcklwd  %[ftmp6],    %[ftmp6],    %[ftmp6]     \n\t" \
  "paddw      %[srcl],     %[srcl],     %[ftmp6]     \n\t" \
  "paddw      %[srch],     %[srch],     %[ftmp6]     \n\t" \
  /* Arithmetic right shift para[1] bits */                \
  "lw         %[tmp0],     0x04(%[para])             \n\t" \
  MMI_MTC1(%[tmp0],     %[ftmp5])                          \
  "psraw      %[srcl],     %[srcl],     %[ftmp5]     \n\t" \
  "psraw      %[srch],     %[srch],     %[ftmp5]     \n\t"
/* clang-format on */
84 
/* Saturate the four 32-bit results in srcl/srch to [0, 255]: packsswh
 * narrows to signed 16-bit with saturation, packushb then narrows to
 * unsigned 8-bit, leaving the four output bytes in the low word of
 * ftmp12. */
#define CLIP_PIXEL_MMI                                     \
  /* Saturated operation */                                \
  "packsswh   %[srcl],     %[srcl],     %[srch]      \n\t" \
  "packushb   %[ftmp12],   %[srcl],     %[ftmp0]     \n\t"
89 
convolve_horiz_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int x_step_q4,int32_t w,int32_t h)90 static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
91                                uint8_t *dst, ptrdiff_t dst_stride,
92                                const InterpKernel *filter, int x0_q4,
93                                int x_step_q4, int32_t w, int32_t h) {
94   const int16_t *filter_x = filter[x0_q4];
95   double ftmp[14];
96   uint32_t tmp[2];
97   uint32_t para[5];
98   para[0] = (1 << ((FILTER_BITS)-1));
99   para[1] = FILTER_BITS;
100   src -= SUBPEL_TAPS / 2 - 1;
101   src_stride -= w;
102   dst_stride -= w;
103   (void)x_step_q4;
104 
105   /* clang-format off */
106   __asm__ volatile(
107     "move       %[tmp1],    %[width]                   \n\t"
108     "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
109     "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
110     "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
111     "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
112     "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
113     "1:                                                \n\t"
114     /* Get 8 data per row */
115     "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
116     "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
117     "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
118     "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
119     "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
120     "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
121     "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
122     "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
123     "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
124     "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
125     "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
126     "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
127     "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
128     "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
129     "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
130     "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
131     MMI_ADDIU(%[width],   %[width],    -0x04)
132     /* Get raw data */
133     GET_DATA_H_MMI
134     ROUND_POWER_OF_TWO_MMI
135     CLIP_PIXEL_MMI
136     "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
137     MMI_ADDIU(%[dst],     %[dst],      0x04)
138     MMI_ADDIU(%[src],     %[src],      0x04)
139     /* Loop count */
140     "bnez       %[width],   1b                         \n\t"
141     "move       %[width],   %[tmp1]                    \n\t"
142     MMI_ADDU(%[src],      %[src],      %[src_stride])
143     MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
144     MMI_ADDIU(%[height],  %[height],   -0x01)
145     "bnez       %[height],  1b                         \n\t"
146     : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
147       [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
148       [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
149       [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
150       [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
151       [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
152       [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
153       [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
154       [src]"+&r"(src),          [width]"+&r"(w),
155       [dst]"+&r"(dst),          [height]"+&r"(h)
156     : [filter]"r"(filter_x),    [para]"r"(para),
157       [src_stride]"r"((mips_reg)src_stride),
158       [dst_stride]"r"((mips_reg)dst_stride)
159     : "memory"
160   );
161   /* clang-format on */
162 }
163 
/* Vertical 8-tap convolution using Loongson MMI, producing four output
 * pixels per inner-loop iteration; w must be a multiple of 4.  The kernel
 * taps are splatted into filter10/32/54/76 word pairs up front; the eight
 * source rows for a column group are reached by stepping tmp0 down in
 * units of the original stride (addr).  Only filter[y0_q4] is used —
 * y_step_q4 is ignored, i.e. no vertical scaling on this path. */
static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int y0_q4,
                              int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding constant, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  ptrdiff_t addr = src_stride;  /* original stride, kept for row stepping */
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Rewind so the first tap row sits SUBPEL_TAPS/2 - 1 rows above the
   * output row. */
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  /* Strides pre-reduced by w: the inner loop advances pointers 4 bytes
   * per iteration (w per row). */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;

  __asm__ volatile(
    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
    "1:                                                \n\t"
    /* Load 8 successive rows (4 pixels each) for this column group. */
    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
    MMI_ADDU(%[tmp0],     %[src],     %[addr])
    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
    /* Widen each row's low 4 bytes to 16-bit. */
    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
    MMI_ADDIU(%[width],   %[width],   -0x04)
    /* Filter, round, clip, store four bytes. */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
    MMI_ADDIU(%[dst],     %[dst],      0x04)
    MMI_ADDIU(%[src],     %[src],      0x04)
    /* Inner loop over the row.  On fall-through, width is restored as
     * addr - src_stride == w (src_stride was reduced by w above).
     * NOTE(review): the restore/advance sequence after this bnez is
     * assumed to execute only on fall-through — confirm the assembler's
     * delay-slot handling (.set reorder) on this toolchain. */
    "bnez       %[width],    1b                        \n\t"
    MMI_SUBU(%[width],    %[addr],     %[src_stride])
    MMI_ADDU(%[src],      %[src],      %[src_stride])
    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
    MMI_ADDIU(%[height],  %[height],   -0x01)
    "bnez       %[height],   1b                        \n\t"
    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src),          [dst]"+&r"(dst),
      [width]"+&r"(w),          [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y),    [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
256 
/* Like convolve_horiz_mmi, but averages the filtered result with the
 * pixels already in dst: dst = (dst + filtered + 1) >> 1.  Four output
 * pixels per iteration; w must be a multiple of 4.  Only filter[x0_q4]
 * is used — x_step_q4 is ignored. */
static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_x = filter[x0_q4];
  double ftmp[14];
  uint32_t tmp[2];
  /* para[0]: rounding constant, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Centre the 8-tap window on each output pixel. */
  src -= SUBPEL_TAPS / 2 - 1;
  /* Strides pre-reduced by w; the inner loop advances 4 bytes at a time. */
  src_stride -= w;
  dst_stride -= w;
  (void)x_step_q4;

  __asm__ volatile(
    "move       %[tmp1],    %[width]                   \n\t"
    "pxor       %[ftmp0],   %[ftmp0],    %[ftmp0]      \n\t"
    "gsldlc1    %[filter1], 0x03(%[filter])            \n\t"
    "gsldrc1    %[filter1], 0x00(%[filter])            \n\t"
    "gsldlc1    %[filter2], 0x0b(%[filter])            \n\t"
    "gsldrc1    %[filter2], 0x08(%[filter])            \n\t"
    "1:                                                \n\t"
    /* Load four overlapping 8-byte windows (offsets 0..3), one per
     * output pixel, and widen to 16-bit. */
    "gsldlc1    %[ftmp5],   0x07(%[src])               \n\t"
    "gsldrc1    %[ftmp5],   0x00(%[src])               \n\t"
    "gsldlc1    %[ftmp7],   0x08(%[src])               \n\t"
    "gsldrc1    %[ftmp7],   0x01(%[src])               \n\t"
    "gsldlc1    %[ftmp9],   0x09(%[src])               \n\t"
    "gsldrc1    %[ftmp9],   0x02(%[src])               \n\t"
    "gsldlc1    %[ftmp11],  0x0A(%[src])               \n\t"
    "gsldrc1    %[ftmp11],  0x03(%[src])               \n\t"
    "punpcklbh  %[ftmp4],   %[ftmp5],    %[ftmp0]      \n\t"
    "punpckhbh  %[ftmp5],   %[ftmp5],    %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp6],   %[ftmp7],    %[ftmp0]      \n\t"
    "punpckhbh  %[ftmp7],   %[ftmp7],    %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp8],   %[ftmp9],    %[ftmp0]      \n\t"
    "punpckhbh  %[ftmp9],   %[ftmp9],    %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp10],  %[ftmp11],   %[ftmp0]      \n\t"
    "punpckhbh  %[ftmp11],  %[ftmp11],   %[ftmp0]      \n\t"
    MMI_ADDIU(%[width],   %[width],    -0x04)
    /* Filter, round and clip the four pixels. */
    GET_DATA_H_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* Average with dst: widen both to 16-bit, add, then (x + 1) >> 1
     * using 0x10001 splatted to halfwords of 1 as both the bias and the
     * shift count. */
    "punpcklbh  %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
    "gsldlc1    %[ftmp4],   0x07(%[dst])               \n\t"
    "gsldrc1    %[ftmp4],   0x00(%[dst])               \n\t"
    "punpcklbh  %[ftmp4],   %[ftmp4],    %[ftmp0]      \n\t"
    "paddh      %[ftmp12],  %[ftmp12],   %[ftmp4]      \n\t"
    "li         %[tmp0],    0x10001                    \n\t"
    MMI_MTC1(%[tmp0],     %[ftmp5])
    "punpcklhw  %[ftmp5],   %[ftmp5],    %[ftmp5]      \n\t"
    "paddh      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
    "psrah      %[ftmp12],  %[ftmp12],   %[ftmp5]      \n\t"
    "packushb   %[ftmp12],  %[ftmp12],   %[ftmp0]      \n\t"
    "swc1       %[ftmp12],  0x00(%[dst])               \n\t"
    MMI_ADDIU(%[dst],     %[dst],      0x04)
    MMI_ADDIU(%[src],     %[src],      0x04)
    /* Inner loop over the row; on fall-through restore width and step to
     * the next row. */
    "bnez       %[width],   1b                         \n\t"
    "move       %[width],   %[tmp1]                    \n\t"
    MMI_ADDU(%[src],      %[src],      %[src_stride])
    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
    MMI_ADDIU(%[height],  %[height],   -0x01)
    "bnez       %[height],  1b                         \n\t"
    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
      [filter1]"=&f"(ftmp[2]),  [filter2]"=&f"(ftmp[3]),
      [ftmp0]"=&f"(ftmp[4]),    [ftmp4]"=&f"(ftmp[5]),
      [ftmp5]"=&f"(ftmp[6]),    [ftmp6]"=&f"(ftmp[7]),
      [ftmp7]"=&f"(ftmp[8]),    [ftmp8]"=&f"(ftmp[9]),
      [ftmp9]"=&f"(ftmp[10]),   [ftmp10]"=&f"(ftmp[11]),
      [ftmp11]"=&f"(ftmp[12]),  [ftmp12]"=&f"(ftmp[13]),
      [tmp0]"=&r"(tmp[0]),      [tmp1]"=&r"(tmp[1]),
      [src]"+&r"(src),          [width]"+&r"(w),
      [dst]"+&r"(dst),          [height]"+&r"(h)
    : [filter]"r"(filter_x),    [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
}
339 
/* Like convolve_vert_mmi, but averages the filtered result with the
 * pixels already in dst: dst = (dst + filtered + 1) >> 1.  Four output
 * pixels per iteration; w must be a multiple of 4.  Only filter[y0_q4]
 * is used — y_step_q4 is ignored. */
static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int y0_q4,
                                  int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding constant, para[1]: shift count for
   * ROUND_POWER_OF_TWO_MMI. */
  uint32_t para[2];
  ptrdiff_t addr = src_stride;  /* original stride, kept for row stepping */
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  /* Rewind so the first tap row sits above the output row. */
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  /* Strides pre-reduced by w; the inner loop advances 4 bytes at a time. */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;

  __asm__ volatile(
    "pxor       %[ftmp0],    %[ftmp0],   %[ftmp0]      \n\t"
    "gsldlc1    %[ftmp4],    0x03(%[filter])           \n\t"
    "gsldrc1    %[ftmp4],    0x00(%[filter])           \n\t"
    "gsldlc1    %[ftmp5],    0x0b(%[filter])           \n\t"
    "gsldrc1    %[ftmp5],    0x08(%[filter])           \n\t"
    "punpcklwd  %[filter10], %[ftmp4],   %[ftmp4]      \n\t"
    "punpckhwd  %[filter32], %[ftmp4],   %[ftmp4]      \n\t"
    "punpcklwd  %[filter54], %[ftmp5],   %[ftmp5]      \n\t"
    "punpckhwd  %[filter76], %[ftmp5],   %[ftmp5]      \n\t"
    "1:                                                \n\t"
    /* Load 8 successive rows (4 pixels each) for this column group. */
    "gsldlc1    %[ftmp4],    0x07(%[src])              \n\t"
    "gsldrc1    %[ftmp4],    0x00(%[src])              \n\t"
    MMI_ADDU(%[tmp0],     %[src],     %[addr])
    "gsldlc1    %[ftmp5],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp5],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp6],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp6],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp7],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp7],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp8],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp8],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp9],    0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp9],    0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp10],   0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp10],   0x00(%[tmp0])             \n\t"
    MMI_ADDU(%[tmp0],     %[tmp0],    %[addr])
    "gsldlc1    %[ftmp11],   0x07(%[tmp0])             \n\t"
    "gsldrc1    %[ftmp11],   0x00(%[tmp0])             \n\t"
    /* Widen each row's low 4 bytes to 16-bit. */
    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp5],    %[ftmp5],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp6],    %[ftmp6],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp7],    %[ftmp7],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp8],    %[ftmp8],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp9],    %[ftmp9],   %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp10],   %[ftmp10],  %[ftmp0]      \n\t"
    "punpcklbh  %[ftmp11],   %[ftmp11],  %[ftmp0]      \n\t"
    MMI_ADDIU(%[width],   %[width],   -0x04)
    /* Filter, round and clip the four pixels. */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* Average with dst: widen both to 16-bit, add, then (x + 1) >> 1
     * using 0x10001 splatted to halfwords of 1 as both the bias and the
     * shift count. */
    "punpcklbh  %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
    "gsldlc1    %[ftmp4],    0x07(%[dst])              \n\t"
    "gsldrc1    %[ftmp4],    0x00(%[dst])              \n\t"
    "punpcklbh  %[ftmp4],    %[ftmp4],   %[ftmp0]      \n\t"
    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp4]      \n\t"
    "li         %[tmp0],     0x10001                   \n\t"
    MMI_MTC1(%[tmp0],     %[ftmp5])
    "punpcklhw  %[ftmp5],    %[ftmp5],   %[ftmp5]      \n\t"
    "paddh      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
    "psrah      %[ftmp12],   %[ftmp12],  %[ftmp5]      \n\t"
    "packushb   %[ftmp12],   %[ftmp12],  %[ftmp0]      \n\t"
    "swc1       %[ftmp12],   0x00(%[dst])              \n\t"
    MMI_ADDIU(%[dst],     %[dst],      0x04)
    MMI_ADDIU(%[src],     %[src],      0x04)
    /* Inner loop over the row.  On fall-through, width is restored as
     * addr - src_stride == w (src_stride was reduced by w above). */
    "bnez       %[width],    1b                        \n\t"
    MMI_SUBU(%[width],    %[addr],     %[src_stride])
    MMI_ADDU(%[src],      %[src],      %[src_stride])
    MMI_ADDU(%[dst],      %[dst],      %[dst_stride])
    MMI_ADDIU(%[height],  %[height],   -0x01)
    "bnez       %[height],   1b                        \n\t"
    : [srcl]"=&f"(ftmp[0]),     [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]),    [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]),    [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]),   [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]),   [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]),  [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src),          [dst]"+&r"(dst),
      [width]"+&r"(w),          [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y),    [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
443 
/* Pixel averaging over a w x h block:
 *   dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1)
 * The filter/step parameters exist only to match the convolve prototype
 * and are ignored.  Widths that are not a multiple of 4 take a scalar C
 * fallback; otherwise the MMI path averages 4 pixels per iteration. */
void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int y_step_q4, int w, int h) {
  int x, y;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (w & 0x03) {
    /* Scalar fallback for widths that are not multiples of 4. */
    for (y = 0; y < h; ++y) {
      for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
      src += src_stride;
      dst += dst_stride;
    }
  } else {
    double ftmp[4];
    uint32_t tmp[2];
    /* Strides pre-reduced by w: the loop advances pointers 4 bytes per
     * iteration (w per row), so adding the reduced stride moves to the
     * next row. */
    src_stride -= w;
    dst_stride -= w;

    __asm__ volatile(
      "move       %[tmp1],    %[width]                  \n\t"
      "pxor       %[ftmp0],   %[ftmp0],   %[ftmp0]      \n\t"
      /* 0x10001 splatted to halfwords of 1: serves as both the +1
       * rounding bias and the shift count for (src + dst + 1) >> 1. */
      "li         %[tmp0],    0x10001                   \n\t"
      MMI_MTC1(%[tmp0],    %[ftmp3])
      "punpcklhw  %[ftmp3],   %[ftmp3],   %[ftmp3]      \n\t"
      "1:                                               \n\t"
      /* Load 4 src and 4 dst bytes, widen to 16-bit, average, re-pack. */
      "gsldlc1    %[ftmp1],   0x07(%[src])              \n\t"
      "gsldrc1    %[ftmp1],   0x00(%[src])              \n\t"
      "gsldlc1    %[ftmp2],   0x07(%[dst])              \n\t"
      "gsldrc1    %[ftmp2],   0x00(%[dst])              \n\t"
      "punpcklbh  %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
      "punpcklbh  %[ftmp2],   %[ftmp2],   %[ftmp0]      \n\t"
      "paddh      %[ftmp1],   %[ftmp1],   %[ftmp2]      \n\t"
      "paddh      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
      "psrah      %[ftmp1],   %[ftmp1],   %[ftmp3]      \n\t"
      "packushb   %[ftmp1],   %[ftmp1],   %[ftmp0]      \n\t"
      "swc1       %[ftmp1],   0x00(%[dst])              \n\t"
      MMI_ADDIU(%[width],  %[width],   -0x04)
      MMI_ADDIU(%[dst],    %[dst],     0x04)
      MMI_ADDIU(%[src],    %[src],     0x04)
      /* Inner loop over the row; on fall-through restore width and step
       * to the next row. */
      "bnez       %[width],   1b                        \n\t"
      "move       %[width],   %[tmp1]                   \n\t"
      MMI_ADDU(%[dst],     %[dst],     %[dst_stride])
      MMI_ADDU(%[src],     %[src],     %[src_stride])
      MMI_ADDIU(%[height], %[height],  -0x01)
      "bnez       %[height],  1b                        \n\t"
      : [ftmp0]"=&f"(ftmp[0]),  [ftmp1]"=&f"(ftmp[1]),
        [ftmp2]"=&f"(ftmp[2]),  [ftmp3]"=&f"(ftmp[3]),
        [tmp0]"=&r"(tmp[0]),    [tmp1]"=&r"(tmp[1]),
        [src]"+&r"(src),        [dst]"+&r"(dst),
        [width]"+&r"(w),        [height]"+&r"(h)
      : [src_stride]"r"((mips_reg)src_stride),
        [dst_stride]"r"((mips_reg)dst_stride)
      : "memory"
    );
  }
}
506 
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)507 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
508                            uint8_t *dst, ptrdiff_t dst_stride,
509                            const InterpKernel *x_filters, int x0_q4,
510                            int x_step_q4, int w, int h) {
511   int x, y;
512   src -= SUBPEL_TAPS / 2 - 1;
513 
514   for (y = 0; y < h; ++y) {
515     int x_q4 = x0_q4;
516     for (x = 0; x < w; ++x) {
517       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
518       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
519       int k, sum = 0;
520       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
521       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
522       x_q4 += x_step_q4;
523     }
524     src += src_stride;
525     dst += dst_stride;
526   }
527 }
528 
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)529 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
530                           uint8_t *dst, ptrdiff_t dst_stride,
531                           const InterpKernel *y_filters, int y0_q4,
532                           int y_step_q4, int w, int h) {
533   int x, y;
534   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
535 
536   for (x = 0; x < w; ++x) {
537     int y_q4 = y0_q4;
538     for (y = 0; y < h; ++y) {
539       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
540       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
541       int k, sum = 0;
542       for (k = 0; k < SUBPEL_TAPS; ++k)
543         sum += src_y[k * src_stride] * y_filter[k];
544       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
545       y_q4 += y_step_q4;
546     }
547     ++src;
548     ++dst;
549   }
550 }
551 
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)552 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
553                               uint8_t *dst, ptrdiff_t dst_stride,
554                               const InterpKernel *y_filters, int y0_q4,
555                               int y_step_q4, int w, int h) {
556   int x, y;
557   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
558 
559   for (x = 0; x < w; ++x) {
560     int y_q4 = y0_q4;
561     for (y = 0; y < h; ++y) {
562       const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
563       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
564       int k, sum = 0;
565       for (k = 0; k < SUBPEL_TAPS; ++k)
566         sum += src_y[k * src_stride] * y_filter[k];
567       dst[y * dst_stride] = ROUND_POWER_OF_TWO(
568           dst[y * dst_stride] +
569               clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
570           1);
571       y_q4 += y_step_q4;
572     }
573     ++src;
574     ++dst;
575   }
576 }
577 
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)578 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
579                                uint8_t *dst, ptrdiff_t dst_stride,
580                                const InterpKernel *x_filters, int x0_q4,
581                                int x_step_q4, int w, int h) {
582   int x, y;
583   src -= SUBPEL_TAPS / 2 - 1;
584 
585   for (y = 0; y < h; ++y) {
586     int x_q4 = x0_q4;
587     for (x = 0; x < w; ++x) {
588       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
589       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
590       int k, sum = 0;
591       for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
592       dst[x] = ROUND_POWER_OF_TWO(
593           dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
594       x_q4 += x_step_q4;
595     }
596     src += src_stride;
597     dst += dst_stride;
598   }
599 }
600 
vpx_convolve8_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int32_t x_step_q4,int y0_q4,int32_t y_step_q4,int32_t w,int32_t h)601 void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
602                        ptrdiff_t dst_stride, const InterpKernel *filter,
603                        int x0_q4, int32_t x_step_q4, int y0_q4,
604                        int32_t y_step_q4, int32_t w, int32_t h) {
605   // Note: Fixed size intermediate buffer, temp, places limits on parameters.
606   // 2d filtering proceeds in 2 steps:
607   //   (1) Interpolate horizontally into an intermediate buffer, temp.
608   //   (2) Interpolate temp vertically to derive the sub-pixel result.
609   // Deriving the maximum number of rows in the temp buffer (135):
610   // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
611   // --Largest block size is 64x64 pixels.
612   // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
613   //   original frame (in 1/16th pixel units).
614   // --Must round-up because block may be located at sub-pixel position.
615   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
616   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
617   // When calling in frame scaling function, the smallest scaling factor is x1/4
618   // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
619   // big enough.
620   uint8_t temp[64 * 135];
621   const int intermediate_height =
622       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
623 
624   assert(w <= 64);
625   assert(h <= 64);
626   assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
627   assert(x_step_q4 <= 64);
628 
629   if (w & 0x03) {
630     convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
631                    64, filter, x0_q4, x_step_q4, w, intermediate_height);
632     convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
633                   filter, y0_q4, y_step_q4, w, h);
634   } else {
635     convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
636                        temp, 64, filter, x0_q4, x_step_q4, w,
637                        intermediate_height);
638     convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
639                       filter, y0_q4, y_step_q4, w, h);
640   }
641 }
642 
/* Horizontal-only 8-tap convolution: MMI path for widths that are a
 * multiple of 4, scalar C fallback otherwise. Vertical arguments are
 * unused in this direction. */
void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                             int32_t w, int32_t h) {
  (void)y0_q4;
  (void)y_step_q4;
  if ((w & 0x03) == 0) {
    convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                       x_step_q4, w, h);
  } else {
    convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                   w, h);
  }
}
657 
/* Vertical-only 8-tap convolution: MMI path for widths that are a
 * multiple of 4, scalar C fallback otherwise. Horizontal arguments are
 * unused in this direction. */
void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;
  if ((w & 0x03) == 0) {
    convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                      y_step_q4, w, h);
  } else {
    convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                  w, h);
  }
}
672 
/* Horizontal 8-tap convolution averaged with the existing dst pixels:
 * MMI path for widths that are a multiple of 4, scalar C fallback
 * otherwise. Vertical arguments are unused. */
void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int32_t x_step_q4, int y0_q4, int y_step_q4,
                                 int w, int h) {
  (void)y0_q4;
  (void)y_step_q4;
  if ((w & 0x03) == 0) {
    convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, w, h);
  } else {
    convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                       x_step_q4, w, h);
  }
}
687 
/* Vertical 8-tap convolution averaged with the existing dst pixels:
 * MMI path for widths that are a multiple of 4, scalar C fallback
 * otherwise. Horizontal arguments are unused. */
void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int32_t x_step_q4, int y0_q4, int y_step_q4,
                                int w, int h) {
  (void)x0_q4;
  (void)x_step_q4;
  if ((w & 0x03) == 0) {
    convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                          y_step_q4, w, h);
  } else {
    convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                      y_step_q4, w, h);
  }
}
702 
vpx_convolve8_avg_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int32_t x_step_q4,int y0_q4,int32_t y_step_q4,int32_t w,int32_t h)703 void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
704                            uint8_t *dst, ptrdiff_t dst_stride,
705                            const InterpKernel *filter, int x0_q4,
706                            int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
707                            int32_t w, int32_t h) {
708   // Fixed size intermediate buffer places limits on parameters.
709   DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
710   assert(w <= 64);
711   assert(h <= 64);
712 
713   vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
714                     y_step_q4, w, h);
715   vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
716 }
717