1 /*
2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
14
/* 8-tap vertical convolution, rounded and averaged with the existing dst,
 * for 4-pixel-wide blocks.  Processes 4 output rows per loop iteration, so
 * height is assumed to be a multiple of 4. */
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3, out;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
  v16i8 src10998, filt0, filt1, filt2, filt3;
  v8i16 filt, out10, out32;

  /* Back up 3 rows so the 8-tap window is centered on the output row. */
  src -= (3 * src_stride);

  /* Load the 8 filter taps and splat them as 4 tap-pairs. */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* Prime the filter window with the first 7 input rows. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  /* Interleave consecutive rows byte-wise, then pack two 4-wide row pairs
   * per vector so each dot-product processes two output rows at once. */
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
             src4332, src6554);
  /* XOR with 128 re-biases unsigned pixels into the signed byte range. */
  XORI_B3_128_SB(src2110, src4332, src6554);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Fetch the next 4 source rows and the 4 dst rows to average with. */
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
    XORI_B2_128_SB(src8776, src10998);
    /* Each FILT_8TAP_DPADD_S_H call filters two 4-pixel output rows. */
    out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                filt1, filt2, filt3);
    out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                filt1, filt2, filt3);
    /* Round-shift by FILTER_BITS, then saturate so the result packs
     * into a byte. */
    SRARI_H2_SH(out10, out32, FILTER_BITS);
    SAT_SH2_SH(out10, out32, 7);
    /* Pack halfwords to bytes and undo the 128 bias in one step. */
    out = PCKEV_XORI128_UB(out10, out32);
    /* Gather the four 4-byte dst rows into a single vector. */
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);

    dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0);
    /* Rounding average of the filtered result with the destination. */
    out = __msa_aver_u_b(out, dst0);

    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    /* Slide the 8-row window down by the 4 rows just consumed. */
    src2110 = src6554;
    src4332 = src8776;
    src6554 = src10998;
    src6 = src10;
  }
}
72
/* 8-tap vertical convolution averaged with dst, for 8-pixel-wide blocks.
 * Processes 4 output rows per iteration; height must be a multiple of 4. */
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  uint32_t loop_cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16u8 dst0, dst1, dst2, dst3;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
  v8i16 filt, out0, out1, out2, out3;

  /* Back up 3 rows so the 8-tap window is centered on the output row. */
  src -= (3 * src_stride);

  /* Load the 8 filter taps and splat them as 4 tap-pairs. */
  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  /* Prime the window with 7 rows; XOR with 128 converts the unsigned
   * pixels into signed bytes for the signed dot-products. */
  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
  src += (7 * src_stride);

  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
             src54_r, src21_r);
  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_SB4(src, src_stride, src7, src8, src9, src10);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    XORI_B4_128_SB(src7, src8, src9, src10);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    /* One 8-pixel output row per dot-product call. */
    out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0, filt1,
                               filt2, filt3);
    out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0, filt1,
                               filt2, filt3);
    out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0, filt1,
                               filt2, filt3);
    out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                               filt1, filt2, filt3);
    /* Round-shift, saturate to byte range, then pack / average / store
     * the 4 rows via the combined helper macro. */
    SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst,
                            dst_stride);
    dst += (4 * dst_stride);

    /* Slide the 8-row window down by the 4 rows just consumed. */
    src10_r = src54_r;
    src32_r = src76_r;
    src54_r = src98_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src65_r = src109_r;
    src6 = src10;
  }
}
128
/* 8-tap vertical convolution averaged with dst for blocks whose width is a
 * multiple of 16.  The outer loop walks 16-column strips; the inner loop
 * produces 4 output rows per pass (height must be a multiple of 4).  The
 * _r/_l interleaves cover the low and high 8 bytes of each 16-byte row. */
static void common_vt_8t_and_aver_dst_16w_mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height, int32_t width) {
  const uint8_t *src_tmp;
  uint8_t *dst_tmp;
  uint32_t loop_cnt, cnt;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
  v16i8 filt0, filt1, filt2, filt3;
  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;

  /* Back up 3 rows so the 8-tap window is centered on the output row. */
  src -= (3 * src_stride);

  filt = LD_SH(filter);
  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

  for (cnt = (width >> 4); cnt--;) {
    /* Restart row pointers at the top of the next 16-column strip. */
    src_tmp = src;
    dst_tmp = dst;

    /* Prime the window with 7 rows, re-biased to signed bytes. */
    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
      LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
      src_tmp += (4 * src_stride);

      LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
      XORI_B4_128_SB(src7, src8, src9, src10);
      ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                 src87_r, src98_r, src109_r);
      ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                 src87_l, src98_l, src109_l);
      /* Filter the low (_r) and high (_l) halves of 4 output rows. */
      out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
      out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
      out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
      out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
      out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
      out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
      out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
      out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
      /* Round-shift, saturate, pack halves back to 16-byte rows, undo the
       * 128 bias, average with dst and store. */
      SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, FILTER_BITS);
      SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, FILTER_BITS);
      SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
      SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
      PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                  out3_r, tmp0, tmp1, tmp2, tmp3);
      XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
      AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst0, dst1,
                  dst2, dst3);
      ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
      dst_tmp += (4 * dst_stride);

      /* Slide the 8-row window down by the 4 rows just consumed. */
      src10_r = src54_r;
      src32_r = src76_r;
      src54_r = src98_r;
      src21_r = src65_r;
      src43_r = src87_r;
      src65_r = src109_r;
      src10_l = src54_l;
      src32_l = src76_l;
      src54_l = src98_l;
      src21_l = src65_l;
      src43_l = src87_l;
      src65_l = src109_l;
      src6 = src10;
    }

    /* Advance to the next 16-column strip. */
    src += 16;
    dst += 16;
  }
}
220
/* 8-tap vertical convolution + average, 16 columns wide: delegate to the
 * multi-width implementation with the width fixed at 16. */
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  const int32_t width = 16;

  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, width);
}
228
/* 8-tap vertical convolution + average, 32 columns wide: delegate to the
 * multi-width implementation with the width fixed at 32. */
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  const int32_t width = 32;

  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, width);
}
236
/* 8-tap vertical convolution + average, 64 columns wide: delegate to the
 * multi-width implementation with the width fixed at 64. */
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  const int32_t width = 64;

  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
                                         filter, height, width);
}
244
/* 2-tap (bilinear) vertical filter averaged with dst for a 4x4 block.
 * Loads 5 source rows (4 outputs + 1 look-ahead row). */
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16i8 src0, src1, src2, src3, src4;
  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
  v16i8 src10_r, src32_r, src21_r, src43_r;
  v8i16 filt;
  v8u16 tmp0, tmp1;

  /* filt0 holds the two taps replicated across every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB4(src, src_stride, src0, src1, src2, src3);
  src += (4 * src_stride);

  src4 = LD_SB(src);
  src += src_stride;

  /* Gather the four 4-byte dst rows into one vector for the average. */
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
  dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0);
  /* Interleave adjacent rows so each byte pair matches a tap pair, then
   * pack two 4-wide row pairs per vector. */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
  /* Unsigned dot-product applies both taps; round-shift by FILTER_BITS. */
  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
  SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);

  /* Pack to bytes and average with the existing destination. */
  out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
  out = __msa_aver_u_b(out, dst0);

  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
278
/* 2-tap (bilinear) vertical filter averaged with dst for a 4x8 block.
 * Loads 9 source rows (8 outputs + 1 look-ahead row). */
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  v16u8 src2110, src4332, src6554, src8776, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* filt0 holds the two taps replicated across every halfword lane. */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
  src += (8 * src_stride);
  src8 = LD_SB(src);

  /* Gather the eight 4-byte dst rows into two vectors (rows 0-3 in dst0,
   * rows 4-7 in dst1) for the averages. */
  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2,
             dst3);
  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
  /* Interleave adjacent rows, then pack two 4-wide row pairs per vector. */
  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
             src32_r, src43_r);
  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
             src76_r, src87_r);
  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
             src76_r, src2110, src4332, src6554, src8776);
  /* Apply the tap pair, round-shift, pack, average and store as two
   * 4x4 groups. */
  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
              tmp0, tmp1, tmp2, tmp3);
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
  dst += (4 * dst_stride);
  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
}
316
/* Dispatch 2-tap vertical + average for 4-wide blocks on height.
 * Only heights 4 and 8 are supported; anything else is a no-op. */
static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  switch (height) {
    case 4:
      common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                        filter);
      break;
    case 8:
      common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                        filter);
      break;
    default: break;
  }
}
327
/* 2-tap (bilinear) vertical filter averaged with dst for an 8x4 block.
 * Loads 5 source rows (4 outputs + 1 look-ahead row). */
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter) {
  v16u8 src0, src1, src2, src3, src4;
  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y: replicate the tap pair across every lane */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
  /* Interleave each row with its successor so byte pairs line up with
   * the tap pair for the dot-product. */
  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
              tmp2, tmp3);
  /* Round-shift by FILTER_BITS, then pack / average / store via helper. */
  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
                     dst_stride);
}
351
/* 2-tap (bilinear) vertical filter averaged with dst, 8 pixels wide, for
 * heights that are a multiple of 8 (8 output rows per loop iteration). */
static void common_vt_2t_and_aver_dst_8x8mult_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v8i16 filt;

  /* rearranging filter_y: replicate the tap pair across every lane */
  filt = LD_SH(filter);
  filt0 = (v16u8)__msa_splati_h(filt, 0);

  /* Prime the window with the first row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 3); loop_cnt--;) {
    LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
    src += (8 * src_stride);
    LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);

    /* Interleave each row with its successor for the pairwise taps. */
    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2,
               vec3);
    ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, vec4, vec5, vec6,
               vec7);
    /* First 4 rows: filter, round-shift, pack/average/store. */
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    /* Second 4 rows. */
    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1,
                tmp2, tmp3);
    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst,
                       dst_stride);
    dst += (4 * dst_stride);

    /* Carry the last loaded row as the next iteration's first row. */
    src0 = src8;
  }
}
395
/* Dispatch 2-tap vertical + average for 8-wide blocks: a dedicated 8x4
 * kernel for height 4, otherwise the multiple-of-8 row loop. */
static void common_vt_2t_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride, uint8_t *dst,
                                             int32_t dst_stride, int8_t *filter,
                                             int32_t height) {
  if (height == 4) {
    common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride, filter);
    return;
  }
  common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
                                        filter, height);
}
407
/* 2-tap (bilinear) vertical filter averaged with dst, 16 pixels wide.
 * 4 output rows per loop iteration; height must be a multiple of 4.
 * ILVR/ILVL cover the low/high 8 bytes of each 16-byte row. */
static void common_vt_2t_and_aver_dst_16w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3, filt;

  /* rearranging filter_y: replicate the tap pair across every lane */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Prime the window with the first row. */
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Rows 0-1: interleave with their successors, filter both halves,
     * round-shift, then pack/average/store one full 16-byte row each. */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
    dst += dst_stride;

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
    dst += dst_stride;

    /* Row 2. */
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
    dst += dst_stride;

    /* Row 3. */
    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
    dst += dst_stride;

    /* Carry the last loaded row as the next iteration's first row. */
    src0 = src4;
  }
}
456
/* 2-tap (bilinear) vertical filter averaged with dst, 32 pixels wide.
 * Each iteration filters 4 rows of two 16-byte halves (columns 0-15 via
 * src0..src4, columns 16-31 via src5..src9); height must be a multiple
 * of 4. */
static void common_vt_2t_and_aver_dst_32w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
  v8u16 tmp0, tmp1, tmp2, tmp3, filt;

  /* rearranging filter_y: replicate the tap pair across every lane */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Prime both 16-column halves with their first row. */
  LD_UB2(src, 16, src0, src5);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Left half: load 4 rows of source and dst, interleave adjacent rows. */
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);

    /* Right half (columns 16-31). */
    LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
    LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
    src += (4 * src_stride);

    /* Left half, rows 0-3: filter, round-shift, pack/average/store. */
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);

    /* Right half, rows 0-3. */
    ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
    ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);

    ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
    ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
    dst += (4 * dst_stride);

    /* Carry each half's last loaded row into the next iteration. */
    src0 = src4;
    src5 = src9;
  }
}
527
/* 2-tap (bilinear) vertical filter averaged with dst, 64 pixels wide.
 * Each iteration filters 2 rows across four 16-byte quarters (carried in
 * src0/src3/src6/src9); height must be a multiple of 2. */
static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src,
                                              int32_t src_stride, uint8_t *dst,
                                              int32_t dst_stride,
                                              int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5;
  v16u8 src6, src7, src8, src9, src10, src11, filt0;
  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v8u16 filt;

  /* rearranging filter_y: replicate the tap pair across every lane */
  filt = LD_UH(filter);
  filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0);

  /* Prime all four 16-column quarters with their first row. */
  LD_UB4(src, 16, src0, src3, src6, src9);
  src += src_stride;

  for (loop_cnt = (height >> 1); loop_cnt--;) {
    /* Load 2 new rows of src and dst for each 16-byte quarter. */
    LD_UB2(src, src_stride, src1, src2);
    LD_UB2(dst, dst_stride, dst0, dst1);
    LD_UB2(src + 16, src_stride, src4, src5);
    LD_UB2(dst + 16, dst_stride, dst2, dst3);
    LD_UB2(src + 32, src_stride, src7, src8);
    LD_UB2(dst + 32, dst_stride, dst4, dst5);
    LD_UB2(src + 48, src_stride, src10, src11);
    LD_UB2(dst + 48, dst_stride, dst6, dst7);
    src += (2 * src_stride);

    /* Quarter 0 (columns 0-15), rows 0-1: interleave adjacent rows,
     * dot-product the tap pair, round-shift, pack/average/store. */
    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);

    /* Quarter 1 (columns 16-31). */
    ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
    ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);

    /* Quarter 2 (columns 32-47). */
    ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
    ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);

    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);

    /* Quarter 3 (columns 48-63). */
    ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
    ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
    SRARI_H2_UH(tmp4, tmp5, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));

    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
    SRARI_H2_UH(tmp6, tmp7, FILTER_BITS);
    PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
    dst += (2 * dst_stride);

    /* Carry each quarter's last loaded row into the next iteration. */
    src0 = src2;
    src3 = src5;
    src6 = src8;
    src9 = src11;
  }
}
605
/* MSA entry point for vertical-only 8-tap convolution with destination
 * averaging (vpx_convolve8_avg_vert).  Narrows the 16-bit taps to int8 and
 * dispatches on block width; when the first two taps (as a packed int32)
 * are zero, a faster 2-tap path is used with the taps at filt_ver[3..4].
 * Unsupported widths fall back to the C implementation. */
void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h) {
  int8_t cnt, filt_ver[8];

  /* Only the unscaled case (one filter phase) is handled here. */
  assert(y_step_q4 == 16);
  /* NOTE(review): reads taps 2-3 through an int32 alias; 0x800000
   * presumably corresponds to taps {0, 128}, i.e. the copy-only filter,
   * which callers are expected to route elsewhere — confirm against the
   * convolve dispatch code. */
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  /* Narrow taps to int8 for the MSA kernels; assumes every tap fits in
   * an int8 (the asserted exclusion above rules out a 128 tap). */
  for (cnt = 0; cnt < 8; ++cnt) {
    filt_ver[cnt] = filter_y[cnt];
  }

  /* Taps 0-1 both zero (checked as one int32) => effectively a short
   * filter; use the 2-tap kernels with the center taps. */
  if (((const int32_t *)filter_y)[0] == 0) {
    switch (w) {
      case 4:
        common_vt_2t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 8:
        common_vt_2t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 16:
        common_vt_2t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 32:
        common_vt_2t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      case 64:
        common_vt_2t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, &filt_ver[3], h);
        break;
      default:
        /* Width not covered by an MSA kernel: use the C reference. */
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else {
    /* Full 8-tap path. */
    switch (w) {
      case 4:
        common_vt_8t_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_ver, h);
        break;
      case 8:
        common_vt_8t_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                         (int32_t)dst_stride, filt_ver, h);
        break;
      case 16:
        common_vt_8t_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);

        break;
      case 32:
        common_vt_8t_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);
        break;
      case 64:
        common_vt_8t_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                          (int32_t)dst_stride, filt_ver, h);
        break;
      default:
        /* Width not covered by an MSA kernel: use the C reference. */
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
                                 x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
677