1 /*
2 * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <string.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_dsp/vpx_filter.h"
19 #include "vpx_ports/asmdefs_mmi.h"
20 #include "vpx_ports/mem.h"
21
/*
 * Horizontal 8-tap filter core (MMI).
 *
 * On entry %[ftmp4]..%[ftmp11] hold the unpacked 16-bit pixels for four
 * adjacent output positions (two registers per position), %[filter1] and
 * %[filter2] hold filter taps 0-3 and 4-7 (loaded by the callers from
 * filter_x), and %[ftmp0] is zero. For each position, pmaddhw forms
 * pairwise products, paddw merges the two halves, and punpckhwd/paddw
 * folds the upper 32-bit lane into the lower one so each register ends
 * with one 32-bit dot-product sum. punpcklwd then gathers the four sums
 * into %[srcl] (positions 0,1) and %[srch] (positions 2,3), ready for
 * ROUND_POWER_OF_TWO_MMI / CLIP_PIXEL_MMI.
 */
#define GET_DATA_H_MMI \
  "pmaddhw %[ftmp4], %[ftmp4], %[filter1] \n\t" \
  "pmaddhw %[ftmp5], %[ftmp5], %[filter2] \n\t" \
  "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  "punpckhwd %[ftmp5], %[ftmp4], %[ftmp0] \n\t" \
  "paddw %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \
  "pmaddhw %[ftmp6], %[ftmp6], %[filter1] \n\t" \
  "pmaddhw %[ftmp7], %[ftmp7], %[filter2] \n\t" \
  "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
  "punpckhwd %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \
  "paddw %[ftmp6], %[ftmp6], %[ftmp7] \n\t" \
  "punpcklwd %[srcl], %[ftmp4], %[ftmp6] \n\t" \
  "pmaddhw %[ftmp8], %[ftmp8], %[filter1] \n\t" \
  "pmaddhw %[ftmp9], %[ftmp9], %[filter2] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  "punpckhwd %[ftmp9], %[ftmp8], %[ftmp0] \n\t" \
  "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
  "pmaddhw %[ftmp10], %[ftmp10], %[filter1] \n\t" \
  "pmaddhw %[ftmp11], %[ftmp11], %[filter2] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  "punpckhwd %[ftmp11], %[ftmp10], %[ftmp0] \n\t" \
  "paddw %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \
  "punpcklwd %[srch], %[ftmp8], %[ftmp10] \n\t"
45
/*
 * Vertical 8-tap filter core (MMI).
 *
 * On entry %[ftmp4]..%[ftmp11] hold the unpacked 16-bit pixels of eight
 * consecutive source rows (four pixels each), and %[filter10]/%[filter32]/
 * %[filter54]/%[filter76] hold duplicated tap pairs (0,1), (2,3), (4,5),
 * (6,7) as prepared by the callers. punpcklhw/punpckhhw interleave
 * adjacent rows so that each pmaddhw computes row_i*tap_i +
 * row_{i+1}*tap_{i+1} per 32-bit lane; the partial sums accumulate into
 * %[srcl] (low two output pixels) and %[srch] (high two).
 * %[ftmp12] is scratch.
 */
#define GET_DATA_V_MMI \
  "punpcklhw %[srcl], %[ftmp4], %[ftmp5] \n\t" \
  "pmaddhw %[srcl], %[srcl], %[filter10] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  "punpcklhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp12] \n\t" \
  "punpckhhw %[srch], %[ftmp4], %[ftmp5] \n\t" \
  "pmaddhw %[srch], %[srch], %[filter10] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp6], %[ftmp7] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter32] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp8], %[ftmp9] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter54] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t" \
  "punpckhhw %[ftmp12], %[ftmp10], %[ftmp11] \n\t" \
  "pmaddhw %[ftmp12], %[ftmp12], %[filter76] \n\t" \
  "paddw %[srch], %[srch], %[ftmp12] \n\t"
69
70 /* clang-format off */
/*
 * Vector equivalent of ROUND_POWER_OF_TWO(): adds the rounding constant
 * para[0] to every 32-bit lane of %[srcl]/%[srch], then arithmetic
 * right-shifts each lane by para[1] bits. The callers set
 * para[0] = 1 << (FILTER_BITS - 1) and para[1] = FILTER_BITS.
 * Clobbers %[tmp0], %[ftmp5], %[ftmp6].
 */
/* clang-format off */
#define ROUND_POWER_OF_TWO_MMI \
  /* Add para[0] */ \
  "lw %[tmp0], 0x00(%[para]) \n\t" \
  MMI_MTC1(%[tmp0], %[ftmp6]) \
  "punpcklwd %[ftmp6], %[ftmp6], %[ftmp6] \n\t" \
  "paddw %[srcl], %[srcl], %[ftmp6] \n\t" \
  "paddw %[srch], %[srch], %[ftmp6] \n\t" \
  /* Arithmetic right shift para[1] bits */ \
  "lw %[tmp0], 0x04(%[para]) \n\t" \
  MMI_MTC1(%[tmp0], %[ftmp5]) \
  "psraw %[srcl], %[srcl], %[ftmp5] \n\t" \
  "psraw %[srch], %[srch], %[ftmp5] \n\t"
/* clang-format on */
84
/*
 * Clip to [0, 255]: packs the four 32-bit sums in %[srcl]/%[srch] to
 * signed 16-bit with saturation, then to unsigned 8-bit with saturation,
 * leaving four output bytes in the low word of %[ftmp12].
 * %[ftmp0] must be zero on entry.
 */
#define CLIP_PIXEL_MMI \
  /* Saturated operation */ \
  "packsswh %[srcl], %[srcl], %[srch] \n\t" \
  "packushb %[ftmp12], %[srcl], %[ftmp0] \n\t"
89
convolve_horiz_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int x_step_q4,int32_t w,int32_t h)90 static void convolve_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
91 uint8_t *dst, ptrdiff_t dst_stride,
92 const InterpKernel *filter, int x0_q4,
93 int x_step_q4, int32_t w, int32_t h) {
94 const int16_t *filter_x = filter[x0_q4];
95 double ftmp[14];
96 uint32_t tmp[2];
97 uint32_t para[5];
98 para[0] = (1 << ((FILTER_BITS)-1));
99 para[1] = FILTER_BITS;
100 src -= SUBPEL_TAPS / 2 - 1;
101 src_stride -= w;
102 dst_stride -= w;
103 (void)x_step_q4;
104
105 /* clang-format off */
106 __asm__ volatile(
107 "move %[tmp1], %[width] \n\t"
108 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
109 "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
110 "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
111 "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
112 "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
113 "1: \n\t"
114 /* Get 8 data per row */
115 "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
116 "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
117 "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
118 "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
119 "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
120 "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
121 "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
122 "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
123 "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
124 "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
125 "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
126 "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
127 "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
128 "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
129 "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
130 "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
131 MMI_ADDIU(%[width], %[width], -0x04)
132 /* Get raw data */
133 GET_DATA_H_MMI
134 ROUND_POWER_OF_TWO_MMI
135 CLIP_PIXEL_MMI
136 "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
137 MMI_ADDIU(%[dst], %[dst], 0x04)
138 MMI_ADDIU(%[src], %[src], 0x04)
139 /* Loop count */
140 "bnez %[width], 1b \n\t"
141 "move %[width], %[tmp1] \n\t"
142 MMI_ADDU(%[src], %[src], %[src_stride])
143 MMI_ADDU(%[dst], %[dst], %[dst_stride])
144 MMI_ADDIU(%[height], %[height], -0x01)
145 "bnez %[height], 1b \n\t"
146 : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
147 [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
148 [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
149 [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
150 [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
151 [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
152 [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
153 [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
154 [src]"+&r"(src), [width]"+&r"(w),
155 [dst]"+&r"(dst), [height]"+&r"(h)
156 : [filter]"r"(filter_x), [para]"r"(para),
157 [src_stride]"r"((mips_reg)src_stride),
158 [dst_stride]"r"((mips_reg)dst_stride)
159 : "memory"
160 );
161 /* clang-format on */
162 }
163
/*
 * Vertical 8-tap convolution using Loongson MMI SIMD.
 *
 * Produces 4 output pixels per inner-loop iteration, so callers only take
 * this path when w is a multiple of 4. y_step_q4 is ignored: the whole
 * block is filtered with the single kernel filter[y0_q4]. `addr` keeps
 * the original row stride for stepping down the 8 input rows;
 * src_stride/dst_stride are pre-biased by -w for end-of-row advance.
 */
static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const InterpKernel *filter, int y0_q4,
                              int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding constant, para[1]: shift, for ROUND_POWER_OF_TWO_MMI */
  uint32_t para[2];
  ptrdiff_t addr = src_stride;  /* unbiased stride, used to walk rows */
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  /* back up to the first tap row */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;

  __asm__ volatile(
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    /* Load the 8 taps and duplicate them pairwise for GET_DATA_V_MMI. */
    "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
    "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
    "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
    "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
    "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
    "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
    "1: \n\t"
    /* Get 8 data per column */
    "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
    MMI_ADDU(%[tmp0], %[src], %[addr])
    "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
    /* Widen bytes to 16-bit for the multiply-accumulate. */
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
    "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
    "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* Fall-through (end of row): width = addr - src_stride == w, since
     * src_stride was pre-decremented by w above. */
    MMI_SUBU(%[width], %[addr], %[src_stride])
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src), [dst]"+&r"(dst),
      [width]"+&r"(w), [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
256
/*
 * Horizontal 8-tap convolution with averaging, using Loongson MMI.
 *
 * Same filtering as convolve_horiz_mmi, but each 4-pixel result is
 * averaged with the existing dst pixels with rounding:
 *   dst = (dst + result + 1) >> 1
 * implemented via the 0x10001 halfword constant (each halfword = 1, used
 * as both the +1 addend and the shift count of 1). Callers only use this
 * path when w is a multiple of 4; x_step_q4 is ignored.
 */
static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const InterpKernel *filter, int x0_q4,
                                   int x_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_x = filter[x0_q4];
  double ftmp[14];
  uint32_t tmp[2];
  /* para[0]: rounding constant, para[1]: shift, for ROUND_POWER_OF_TWO_MMI */
  uint32_t para[2];
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  src -= SUBPEL_TAPS / 2 - 1;  /* back up to the first filter tap */
  src_stride -= w;
  dst_stride -= w;
  (void)x_step_q4;

  __asm__ volatile(
    "move %[tmp1], %[width] \n\t"
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
    "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
    "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
    "1: \n\t"
    /* Get 8 data per row */
    "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
    "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
    "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
    "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
    "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
    "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
    "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
    "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
    "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
    "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
    "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
    "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data */
    GET_DATA_H_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* Average the clipped result with the current dst pixels, rounding. */
    "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
    "li %[tmp0], 0x10001 \n\t"
    MMI_MTC1(%[tmp0], %[ftmp5])
    "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* Fall-through (end of row): reset width and advance to the next row. */
    "move %[width], %[tmp1] \n\t"
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
      [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
      [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
      [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
      [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
      [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
      [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
      [src]"+&r"(src), [width]"+&r"(w),
      [dst]"+&r"(dst), [height]"+&r"(h)
    : [filter]"r"(filter_x), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride)
    : "memory"
  );
}
339
/*
 * Vertical 8-tap convolution with averaging, using Loongson MMI.
 *
 * Same filtering as convolve_vert_mmi, but each 4-pixel result is
 * averaged with the existing dst pixels with rounding:
 *   dst = (dst + result + 1) >> 1
 * via the 0x10001 halfword constant (each halfword = 1, used as both the
 * +1 addend and the shift count of 1). Callers only use this path when
 * w is a multiple of 4; y_step_q4 is ignored.
 */
static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int y0_q4,
                                  int y_step_q4, int32_t w, int32_t h) {
  const int16_t *filter_y = filter[y0_q4];
  double ftmp[16];
  uint32_t tmp[1];
  /* para[0]: rounding constant, para[1]: shift, for ROUND_POWER_OF_TWO_MMI */
  uint32_t para[2];
  ptrdiff_t addr = src_stride;  /* unbiased stride, used to walk rows */
  para[0] = (1 << ((FILTER_BITS)-1));
  para[1] = FILTER_BITS;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);  /* back up to the first tap row */
  src_stride -= w;
  dst_stride -= w;
  (void)y_step_q4;

  __asm__ volatile(
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    /* Load the 8 taps and duplicate them pairwise for GET_DATA_V_MMI. */
    "gsldlc1 %[ftmp4], 0x03(%[filter]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[filter]) \n\t"
    "gsldlc1 %[ftmp5], 0x0b(%[filter]) \n\t"
    "gsldrc1 %[ftmp5], 0x08(%[filter]) \n\t"
    "punpcklwd %[filter10], %[ftmp4], %[ftmp4] \n\t"
    "punpckhwd %[filter32], %[ftmp4], %[ftmp4] \n\t"
    "punpcklwd %[filter54], %[ftmp5], %[ftmp5] \n\t"
    "punpckhwd %[filter76], %[ftmp5], %[ftmp5] \n\t"
    "1: \n\t"
    /* Get 8 data per column */
    "gsldlc1 %[ftmp4], 0x07(%[src]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[src]) \n\t"
    MMI_ADDU(%[tmp0], %[src], %[addr])
    "gsldlc1 %[ftmp5], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp5], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp6], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp6], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp7], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp7], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp8], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp8], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp9], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp9], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp10], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp10], 0x00(%[tmp0]) \n\t"
    MMI_ADDU(%[tmp0], %[tmp0], %[addr])
    "gsldlc1 %[ftmp11], 0x07(%[tmp0]) \n\t"
    "gsldrc1 %[ftmp11], 0x00(%[tmp0]) \n\t"
    /* Widen bytes to 16-bit for the multiply-accumulate. */
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
    "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
    "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
    "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t"
    "punpcklbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
    "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t"
    "punpcklbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
    MMI_ADDIU(%[width], %[width], -0x04)
    /* Get raw data */
    GET_DATA_V_MMI
    ROUND_POWER_OF_TWO_MMI
    CLIP_PIXEL_MMI
    /* Average the clipped result with the current dst pixels, rounding. */
    "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
    "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
    "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
    "li %[tmp0], 0x10001 \n\t"
    MMI_MTC1(%[tmp0], %[ftmp5])
    "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
    "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
    "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
    "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
    MMI_ADDIU(%[dst], %[dst], 0x04)
    MMI_ADDIU(%[src], %[src], 0x04)
    /* Loop count */
    "bnez %[width], 1b \n\t"
    /* Fall-through (end of row): width = addr - src_stride == w, since
     * src_stride was pre-decremented by w above. */
    MMI_SUBU(%[width], %[addr], %[src_stride])
    MMI_ADDU(%[src], %[src], %[src_stride])
    MMI_ADDU(%[dst], %[dst], %[dst_stride])
    MMI_ADDIU(%[height], %[height], -0x01)
    "bnez %[height], 1b \n\t"
    : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
      [filter10]"=&f"(ftmp[2]), [filter32]"=&f"(ftmp[3]),
      [filter54]"=&f"(ftmp[4]), [filter76]"=&f"(ftmp[5]),
      [ftmp0]"=&f"(ftmp[6]), [ftmp4]"=&f"(ftmp[7]),
      [ftmp5]"=&f"(ftmp[8]), [ftmp6]"=&f"(ftmp[9]),
      [ftmp7]"=&f"(ftmp[10]), [ftmp8]"=&f"(ftmp[11]),
      [ftmp9]"=&f"(ftmp[12]), [ftmp10]"=&f"(ftmp[13]),
      [ftmp11]"=&f"(ftmp[14]), [ftmp12]"=&f"(ftmp[15]),
      [src]"+&r"(src), [dst]"+&r"(dst),
      [width]"+&r"(w), [height]"+&r"(h),
      [tmp0]"=&r"(tmp[0])
    : [filter]"r"(filter_y), [para]"r"(para),
      [src_stride]"r"((mips_reg)src_stride),
      [dst_stride]"r"((mips_reg)dst_stride),
      [addr]"r"((mips_reg)addr)
    : "memory"
  );
}
443
/*
 * dst = ROUND_POWER_OF_TWO(dst + src, 1) over a w x h block.
 *
 * The filter/step arguments exist only to match the convolve prototype
 * and are ignored. Widths that are not a multiple of 4 take the scalar
 * loop; otherwise the MMI loop averages 4 pixels per iteration using the
 * 0x10001 halfword constant (each halfword = 1) as both the rounding
 * addend and the shift count of 1.
 */
void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int y_step_q4, int w, int h) {
  int x, y;

  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  if (w & 0x03) {
    /* Scalar fallback for widths not divisible by 4. */
    for (y = 0; y < h; ++y) {
      for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
      src += src_stride;
      dst += dst_stride;
    }
  } else {
    double ftmp[4];
    uint32_t tmp[2];
    /* Pre-bias strides by -w so end-of-row addition lands on the next row. */
    src_stride -= w;
    dst_stride -= w;

    __asm__ volatile(
      "move %[tmp1], %[width] \n\t"
      "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
      "li %[tmp0], 0x10001 \n\t"
      MMI_MTC1(%[tmp0], %[ftmp3])
      "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
      "1: \n\t"
      "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
      "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
      "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t"
      "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t"
      "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
      "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
      "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
      "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
      "swc1 %[ftmp1], 0x00(%[dst]) \n\t"
      MMI_ADDIU(%[width], %[width], -0x04)
      MMI_ADDIU(%[dst], %[dst], 0x04)
      MMI_ADDIU(%[src], %[src], 0x04)
      "bnez %[width], 1b \n\t"
      /* Fall-through (end of row): reset width, advance to the next row. */
      "move %[width], %[tmp1] \n\t"
      MMI_ADDU(%[dst], %[dst], %[dst_stride])
      MMI_ADDU(%[src], %[src], %[src_stride])
      MMI_ADDIU(%[height], %[height], -0x01)
      "bnez %[height], 1b \n\t"
      : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
        [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
        [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
        [src]"+&r"(src), [dst]"+&r"(dst),
        [width]"+&r"(w), [height]"+&r"(h)
      : [src_stride]"r"((mips_reg)src_stride),
        [dst_stride]"r"((mips_reg)dst_stride)
      : "memory"
    );
  }
}
506
convolve_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)507 static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
508 uint8_t *dst, ptrdiff_t dst_stride,
509 const InterpKernel *x_filters, int x0_q4,
510 int x_step_q4, int w, int h) {
511 int x, y;
512 src -= SUBPEL_TAPS / 2 - 1;
513
514 for (y = 0; y < h; ++y) {
515 int x_q4 = x0_q4;
516 for (x = 0; x < w; ++x) {
517 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
518 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
519 int k, sum = 0;
520 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
521 dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
522 x_q4 += x_step_q4;
523 }
524 src += src_stride;
525 dst += dst_stride;
526 }
527 }
528
convolve_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)529 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
530 uint8_t *dst, ptrdiff_t dst_stride,
531 const InterpKernel *y_filters, int y0_q4,
532 int y_step_q4, int w, int h) {
533 int x, y;
534 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
535
536 for (x = 0; x < w; ++x) {
537 int y_q4 = y0_q4;
538 for (y = 0; y < h; ++y) {
539 const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
540 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
541 int k, sum = 0;
542 for (k = 0; k < SUBPEL_TAPS; ++k)
543 sum += src_y[k * src_stride] * y_filter[k];
544 dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
545 y_q4 += y_step_q4;
546 }
547 ++src;
548 ++dst;
549 }
550 }
551
convolve_avg_vert(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * y_filters,int y0_q4,int y_step_q4,int w,int h)552 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
553 uint8_t *dst, ptrdiff_t dst_stride,
554 const InterpKernel *y_filters, int y0_q4,
555 int y_step_q4, int w, int h) {
556 int x, y;
557 src -= src_stride * (SUBPEL_TAPS / 2 - 1);
558
559 for (x = 0; x < w; ++x) {
560 int y_q4 = y0_q4;
561 for (y = 0; y < h; ++y) {
562 const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
563 const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
564 int k, sum = 0;
565 for (k = 0; k < SUBPEL_TAPS; ++k)
566 sum += src_y[k * src_stride] * y_filter[k];
567 dst[y * dst_stride] = ROUND_POWER_OF_TWO(
568 dst[y * dst_stride] +
569 clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
570 1);
571 y_q4 += y_step_q4;
572 }
573 ++src;
574 ++dst;
575 }
576 }
577
convolve_avg_horiz(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * x_filters,int x0_q4,int x_step_q4,int w,int h)578 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
579 uint8_t *dst, ptrdiff_t dst_stride,
580 const InterpKernel *x_filters, int x0_q4,
581 int x_step_q4, int w, int h) {
582 int x, y;
583 src -= SUBPEL_TAPS / 2 - 1;
584
585 for (y = 0; y < h; ++y) {
586 int x_q4 = x0_q4;
587 for (x = 0; x < w; ++x) {
588 const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
589 const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
590 int k, sum = 0;
591 for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
592 dst[x] = ROUND_POWER_OF_TWO(
593 dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
594 x_q4 += x_step_q4;
595 }
596 src += src_stride;
597 dst += dst_stride;
598 }
599 }
600
vpx_convolve8_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int32_t x_step_q4,int y0_q4,int32_t y_step_q4,int32_t w,int32_t h)601 void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
602 ptrdiff_t dst_stride, const InterpKernel *filter,
603 int x0_q4, int32_t x_step_q4, int y0_q4,
604 int32_t y_step_q4, int32_t w, int32_t h) {
605 // Note: Fixed size intermediate buffer, temp, places limits on parameters.
606 // 2d filtering proceeds in 2 steps:
607 // (1) Interpolate horizontally into an intermediate buffer, temp.
608 // (2) Interpolate temp vertically to derive the sub-pixel result.
609 // Deriving the maximum number of rows in the temp buffer (135):
610 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
611 // --Largest block size is 64x64 pixels.
612 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
613 // original frame (in 1/16th pixel units).
614 // --Must round-up because block may be located at sub-pixel position.
615 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
616 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
617 // When calling in frame scaling function, the smallest scaling factor is x1/4
618 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
619 // big enough.
620 uint8_t temp[64 * 135];
621 const int intermediate_height =
622 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
623
624 assert(w <= 64);
625 assert(h <= 64);
626 assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
627 assert(x_step_q4 <= 64);
628
629 if (w & 0x03) {
630 convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp,
631 64, filter, x0_q4, x_step_q4, w, intermediate_height);
632 convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
633 filter, y0_q4, y_step_q4, w, h);
634 } else {
635 convolve_horiz_mmi(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
636 temp, 64, filter, x0_q4, x_step_q4, w,
637 intermediate_height);
638 convolve_vert_mmi(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
639 filter, y0_q4, y_step_q4, w, h);
640 }
641 }
642
/*
 * Horizontal-only 8-tap convolution entry point.
 * Dispatches to the MMI kernel when w is a multiple of 4 (the SIMD path
 * writes 4 pixels at a time), otherwise to the scalar C path.
 */
void vpx_convolve8_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
                             int32_t w, int32_t h) {
  /* Vertical phase parameters are irrelevant for a horizontal-only pass. */
  (void)y0_q4;
  (void)y_step_q4;
  if ((w & 0x03) == 0) {
    convolve_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                       x_step_q4, w, h);
  } else {
    convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                   w, h);
  }
}
657
/*
 * Vertical-only 8-tap convolution entry point.
 * Dispatches to the MMI kernel when w is a multiple of 4, otherwise to
 * the scalar C path.
 */
void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  /* Horizontal phase parameters are irrelevant for a vertical-only pass. */
  (void)x0_q4;
  (void)x_step_q4;
  if ((w & 0x03) == 0) {
    convolve_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                      y_step_q4, w, h);
  } else {
    convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                  w, h);
  }
}
672
/*
 * Horizontal-only 8-tap convolution, averaged into dst.
 * Dispatches to the MMI kernel when w is a multiple of 4, otherwise to
 * the scalar C path.
 */
void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int32_t x_step_q4, int y0_q4, int y_step_q4,
                                 int w, int h) {
  /* Vertical phase parameters are irrelevant for a horizontal-only pass. */
  (void)y0_q4;
  (void)y_step_q4;
  if ((w & 0x03) == 0) {
    convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
                           x_step_q4, w, h);
  } else {
    convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
                       x_step_q4, w, h);
  }
}
687
/*
 * Vertical-only 8-tap convolution, averaged into dst.
 * Dispatches to the MMI kernel when w is a multiple of 4, otherwise to
 * the scalar C path.
 */
void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int32_t x_step_q4, int y0_q4, int y_step_q4,
                                int w, int h) {
  /* Horizontal phase parameters are irrelevant for a vertical-only pass. */
  (void)x0_q4;
  (void)x_step_q4;
  if ((w & 0x03) == 0) {
    convolve_avg_vert_mmi(src, src_stride, dst, dst_stride, filter, y0_q4,
                          y_step_q4, w, h);
  } else {
    convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4,
                      y_step_q4, w, h);
  }
}
702
vpx_convolve8_avg_mmi(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int32_t x_step_q4,int y0_q4,int32_t y_step_q4,int32_t w,int32_t h)703 void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
704 uint8_t *dst, ptrdiff_t dst_stride,
705 const InterpKernel *filter, int x0_q4,
706 int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
707 int32_t w, int32_t h) {
708 // Fixed size intermediate buffer places limits on parameters.
709 DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
710 assert(w <= 64);
711 assert(h <= 64);
712
713 vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
714 y_step_q4, w, h);
715 vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
716 }
717