/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include <assert.h>
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/vpx_convolve_msa.h"
14
common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)15 static void common_hv_8ht_8vt_and_aver_dst_4w_msa(
16 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
17 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
18 uint32_t loop_cnt;
19 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
20 v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
21 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
22 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
23 v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
24 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
25
26 mask0 = LD_UB(&mc_filt_mask_arr[16]);
27 src -= (3 + 3 * src_stride);
28
29 /* rearranging filter */
30 filt = LD_SH(filter_horiz);
31 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
32
33 mask1 = mask0 + 2;
34 mask2 = mask0 + 4;
35 mask3 = mask0 + 6;
36
37 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
38 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
39 src += (7 * src_stride);
40
41 hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
42 filt_hz1, filt_hz2, filt_hz3);
43 hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
44 filt_hz1, filt_hz2, filt_hz3);
45 hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
46 filt_hz1, filt_hz2, filt_hz3);
47 hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
48 filt_hz1, filt_hz2, filt_hz3);
49 SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
50
51 filt = LD_SH(filter_vert);
52 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
53
54 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
55 vec2 = (v8i16)__msa_ilvev_b((v16i8)hz_out5, (v16i8)hz_out4);
56
57 for (loop_cnt = (height >> 2); loop_cnt--;) {
58 LD_SB4(src, src_stride, src7, src8, src9, src10);
59 XORI_B4_128_SB(src7, src8, src9, src10);
60 src += (4 * src_stride);
61
62 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
63 hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
64 filt_hz1, filt_hz2, filt_hz3);
65 hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8);
66 vec3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
67 res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
68 filt_vt2, filt_vt3);
69
70 hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
71 filt_hz1, filt_hz2, filt_hz3);
72 hz_out8 = (v8i16)__msa_sldi_b((v16i8)hz_out9, (v16i8)hz_out7, 8);
73 vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
74 res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
75 filt_vt2, filt_vt3);
76 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
77
78 SRARI_H2_SH(res0, res1, FILTER_BITS);
79 SAT_SH2_SH(res0, res1, 7);
80 PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
81 XORI_B2_128_UB(tmp0, tmp1);
82 AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
83 ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
84 dst += (4 * dst_stride);
85
86 hz_out5 = hz_out9;
87 vec0 = vec2;
88 vec1 = vec3;
89 vec2 = vec4;
90 }
91 }
92
common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)93 static void common_hv_8ht_8vt_and_aver_dst_8w_msa(
94 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
95 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
96 uint32_t loop_cnt;
97 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
98 v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
99 v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
100 v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
101 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
102 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
103 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
104
105 mask0 = LD_UB(&mc_filt_mask_arr[0]);
106 src -= (3 + 3 * src_stride);
107
108 /* rearranging filter */
109 filt = LD_SH(filter_horiz);
110 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
111
112 mask1 = mask0 + 2;
113 mask2 = mask0 + 4;
114 mask3 = mask0 + 6;
115
116 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
117 src += (7 * src_stride);
118
119 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
120 hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
121 filt_hz1, filt_hz2, filt_hz3);
122 hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
123 filt_hz1, filt_hz2, filt_hz3);
124 hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
125 filt_hz1, filt_hz2, filt_hz3);
126 hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
127 filt_hz1, filt_hz2, filt_hz3);
128 hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
129 filt_hz1, filt_hz2, filt_hz3);
130 hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
131 filt_hz1, filt_hz2, filt_hz3);
132 hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
133 filt_hz1, filt_hz2, filt_hz3);
134
135 filt = LD_SH(filter_vert);
136 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
137
138 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
139 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
140 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
141
142 for (loop_cnt = (height >> 2); loop_cnt--;) {
143 LD_SB4(src, src_stride, src7, src8, src9, src10);
144 XORI_B4_128_SB(src7, src8, src9, src10);
145 src += (4 * src_stride);
146
147 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
148
149 hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
150 filt_hz1, filt_hz2, filt_hz3);
151 out3 = (v8i16)__msa_ilvev_b((v16i8)hz_out7, (v16i8)hz_out6);
152 tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
153 filt_vt2, filt_vt3);
154
155 hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
156 filt_hz1, filt_hz2, filt_hz3);
157 out7 = (v8i16)__msa_ilvev_b((v16i8)hz_out8, (v16i8)hz_out7);
158 tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
159 filt_vt2, filt_vt3);
160
161 hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
162 filt_hz1, filt_hz2, filt_hz3);
163 out8 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8);
164 tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
165 filt_vt2, filt_vt3);
166
167 hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
168 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
169 out9 = (v8i16)__msa_ilvev_b((v16i8)hz_out10, (v16i8)hz_out9);
170 tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
171 filt_vt2, filt_vt3);
172
173 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
174 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
175 CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst,
176 dst_stride);
177 dst += (4 * dst_stride);
178
179 hz_out6 = hz_out10;
180 out0 = out2;
181 out1 = out3;
182 out2 = out8;
183 out4 = out6;
184 out5 = out7;
185 out6 = out9;
186 }
187 }
188
/*
 * 16-wide 8-tap/8-tap convolve-and-average, built from two side-by-side
 * passes of the 8-wide kernel (column offsets 0 and 8). All other arguments
 * are forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
200
/*
 * 32-wide 8-tap/8-tap convolve-and-average, built from four side-by-side
 * passes of the 8-wide kernel (column offsets 0, 8, 16, 24). All other
 * arguments are forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
212
/*
 * 64-wide 8-tap/8-tap convolve-and-average, built from eight side-by-side
 * passes of the 8-wide kernel (column offsets 0, 8, ..., 56). All other
 * arguments are forwarded unchanged.
 */
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 8) {
    common_hv_8ht_8vt_and_aver_dst_8w_msa(src + col, src_stride, dst + col,
                                          dst_stride, filter_horiz,
                                          filter_vert, height);
  }
}
224
common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)225 static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(
226 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
227 int8_t *filter_horiz, int8_t *filter_vert) {
228 v16i8 src0, src1, src2, src3, src4, mask;
229 v16u8 filt_hz, filt_vt, vec0, vec1;
230 v16u8 dst0, dst1, dst2, dst3, res0, res1;
231 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
232
233 mask = LD_SB(&mc_filt_mask_arr[16]);
234
235 /* rearranging filter */
236 filt = LD_UH(filter_horiz);
237 filt_hz = (v16u8)__msa_splati_h((v8i16)filt, 0);
238
239 filt = LD_UH(filter_vert);
240 filt_vt = (v16u8)__msa_splati_h((v8i16)filt, 0);
241
242 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
243
244 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
245 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
246 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
247 hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
248 hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
249 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
250
251 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
252 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
253 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
254 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
255 PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
256 AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
257 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
258 }
259
common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)260 static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(
261 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
262 int8_t *filter_horiz, int8_t *filter_vert) {
263 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
264 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
265 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
266 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
267 v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
268 v8i16 filt;
269
270 mask = LD_SB(&mc_filt_mask_arr[16]);
271
272 /* rearranging filter */
273 filt = LD_SH(filter_horiz);
274 filt_hz = (v16u8)__msa_splati_h(filt, 0);
275
276 filt = LD_SH(filter_vert);
277 filt_vt = (v16u8)__msa_splati_h(filt, 0);
278
279 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
280 src += (8 * src_stride);
281 src8 = LD_SB(src);
282
283 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
284 hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
285 hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, FILTER_BITS);
286 hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, FILTER_BITS);
287 hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, FILTER_BITS);
288 SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
289 hz_out3, hz_out5, 8);
290 hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6);
291
292 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
293 ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4,
294 dst6);
295 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
296 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
297 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0,
298 tmp1, tmp2, tmp3);
299 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
300 PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2,
301 res3);
302 AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2,
303 res3);
304 ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
305 dst += (4 * dst_stride);
306 ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
307 }
308
/*
 * 4-wide 2-tap/2-tap convolve-and-average for an arbitrary number of rows.
 *
 * The work is split into 8-row chunks (4x8 kernel) followed by at most one
 * 4-row chunk (4x4 kernel). Heights 4 and 8 therefore run exactly one kernel
 * call, as before; other heights used to fall through without writing
 * anything. A remainder of fewer than four rows is not processed, which
 * mirrors the height >> 2 loops used by the 8-tap kernels in this file.
 */
static void common_hv_2ht_2vt_and_aver_dst_4w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  for (; height >= 8; height -= 8) {
    common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
    /* The next chunk starts on the row that closed the previous one. */
    src += 8 * src_stride;
    dst += 8 * dst_stride;
  }

  if (height >= 4) {
    common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
                                           filter_horiz, filter_vert);
  }
}
320
common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert)321 static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(
322 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
323 int8_t *filter_horiz, int8_t *filter_vert) {
324 v16i8 src0, src1, src2, src3, src4, mask;
325 v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
326 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
327 v8i16 filt;
328
329 mask = LD_SB(&mc_filt_mask_arr[0]);
330
331 /* rearranging filter */
332 filt = LD_SH(filter_horiz);
333 filt_hz = (v16u8)__msa_splati_h(filt, 0);
334
335 filt = LD_SH(filter_vert);
336 filt_vt = (v16u8)__msa_splati_h(filt, 0);
337
338 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
339 src += (5 * src_stride);
340
341 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
342 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
343 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
344 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
345 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
346
347 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
348 vec1 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
349 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
350
351 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
352 vec2 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
353 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
354
355 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
356 vec3 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
357 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
358
359 SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
360 PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
361 dst_stride);
362 }
363
common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)364 static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
365 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
366 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
367 uint32_t loop_cnt;
368 v16i8 src0, src1, src2, src3, src4, mask;
369 v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
370 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
371 v8i16 filt;
372
373 mask = LD_SB(&mc_filt_mask_arr[0]);
374
375 /* rearranging filter */
376 filt = LD_SH(filter_horiz);
377 filt_hz = (v16u8)__msa_splati_h(filt, 0);
378
379 filt = LD_SH(filter_vert);
380 filt_vt = (v16u8)__msa_splati_h(filt, 0);
381
382 src0 = LD_SB(src);
383 src += src_stride;
384
385 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
386
387 for (loop_cnt = (height >> 2); loop_cnt--;) {
388 LD_SB4(src, src_stride, src1, src2, src3, src4);
389 src += (4 * src_stride);
390
391 hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
392 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
393 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
394
395 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
396 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
397 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
398
399 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
400
401 hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
402 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
403 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
404
405 hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
406 vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
407 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
408
409 SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
410 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
411 PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst,
412 dst_stride);
413 dst += (4 * dst_stride);
414 }
415 }
416
/*
 * 8-wide 2-tap/2-tap convolve-and-average: a height of exactly 4 goes to the
 * dedicated 8x4 kernel, every other height goes to the looping kernel.
 */
static void common_hv_2ht_2vt_and_aver_dst_8w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  if (height != 4) {
    common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(
        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
    return;
  }

  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
                                         filter_horiz, filter_vert);
}
428
common_hv_2ht_2vt_and_aver_dst_16w_msa(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,int8_t * filter_horiz,int8_t * filter_vert,int32_t height)429 static void common_hv_2ht_2vt_and_aver_dst_16w_msa(
430 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
431 int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
432 uint32_t loop_cnt;
433 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
434 v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
435 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
436 v8i16 filt;
437
438 mask = LD_SB(&mc_filt_mask_arr[0]);
439
440 /* rearranging filter */
441 filt = LD_SH(filter_horiz);
442 filt_hz = (v16u8)__msa_splati_h(filt, 0);
443
444 filt = LD_SH(filter_vert);
445 filt_vt = (v16u8)__msa_splati_h(filt, 0);
446
447 LD_SB2(src, 8, src0, src1);
448 src += src_stride;
449
450 hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
451 hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
452
453 for (loop_cnt = (height >> 2); loop_cnt--;) {
454 LD_SB4(src, src_stride, src0, src2, src4, src6);
455 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
456 src += (4 * src_stride);
457 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
458
459 hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
460 hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
461 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
462 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
463 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
464 PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
465 dst += dst_stride;
466
467 hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
468 hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
469 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
470 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
471 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
472 PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
473 dst += dst_stride;
474
475 hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
476 hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
477 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
478 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
479 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
480 PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
481 dst += dst_stride;
482
483 hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
484 hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
485 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
486 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
487 SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
488 PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
489 dst += dst_stride;
490 }
491 }
492
/*
 * 32-wide 2-tap/2-tap convolve-and-average, built from two side-by-side
 * passes of the 16-wide kernel (column offsets 0 and 16). All other
 * arguments are forwarded unchanged.
 */
static void common_hv_2ht_2vt_and_aver_dst_32w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 32; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
504
/*
 * 64-wide 2-tap/2-tap convolve-and-average, built from four side-by-side
 * passes of the 16-wide kernel (column offsets 0, 16, 32, 48). All other
 * arguments are forwarded unchanged.
 */
static void common_hv_2ht_2vt_and_aver_dst_64w_msa(
    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
  int32_t col;

  for (col = 0; col < 64; col += 16) {
    common_hv_2ht_2vt_and_aver_dst_16w_msa(src + col, src_stride, dst + col,
                                           dst_stride, filter_horiz,
                                           filter_vert, height);
  }
}
516
/*
 * Horizontal + vertical convolution whose result is averaged into dst,
 * dispatching to an MSA kernel chosen by filter shape and block width.
 *
 * src, src_stride   source pixels and row pitch.
 * dst, dst_stride   destination pixels and row pitch.
 * filter_x/_y       8 int16_t coefficients each; narrowed to int8_t here
 *                   before being handed to the kernels.
 * x_step_q4/_q4     must be 16 (asserted).
 * w, h              block width and height.
 *
 * Selection:
 *  - both kernels have coefficients 0 and 1 equal to zero: 2-tap kernels,
 *    which are given a pointer to coefficient index 3;
 *  - exactly one of them does: vpx_convolve8_avg_c;
 *  - otherwise: 8-tap kernels.
 * Widths other than 4, 8, 16, 32 and 64 always go to vpx_convolve8_avg_c.
 */
void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
                           const int16_t *filter_y, int y_step_q4, int w,
                           int h) {
  /* NOTE(review): the kernels fetch coefficients with vector loads (LD_SH),
   * which appear to read a whole 16-byte vector -- confirm against the MSA
   * macro header. The buffers are sized so that such a read starting at
   * index 3 (the 2-tap case) still ends inside the array; the padding is
   * zeroed and none of the lanes the kernels splat come from it. */
  int8_t filt_hor[3 + 16] = { 0 };
  int8_t filt_ver[3 + 16] = { 0 };
  /* The coefficients are inspected element by element rather than through an
   * int32_t alias of the int16_t array, so the tests below do not depend on
   * byte order or on type-punned reads. */
  const int x_is_2tap = (filter_x[0] == 0 && filter_x[1] == 0);
  const int y_is_2tap = (filter_y[0] == 0 && filter_y[1] == 0);
  int cnt;

  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);
  /* A kernel with coefficient 2 equal to 0 and coefficient 3 equal to 128 is
   * not expected here (same condition the little-endian 0x800000 word
   * comparison expressed). */
  assert(!(filter_x[2] == 0 && filter_x[3] == 128));
  assert(!(filter_y[2] == 0 && filter_y[3] == 128));

  for (cnt = 0; cnt < 8; ++cnt) {
    filt_hor[cnt] = filter_x[cnt];
    filt_ver[cnt] = filter_y[cnt];
  }

  if (x_is_2tap && y_is_2tap) {
    switch (w) {
      case 4:
        common_hv_2ht_2vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 8:
        common_hv_2ht_2vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, &filt_hor[3],
                                              &filt_ver[3], h);
        break;
      case 16:
        common_hv_2ht_2vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 32:
        common_hv_2ht_2vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      case 64:
        common_hv_2ht_2vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride,
                                               &filt_hor[3], &filt_ver[3], h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  } else if (x_is_2tap || y_is_2tap) {
    /* Mixed 2-tap / 8-tap has no MSA kernel in this file. */
    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
  } else {
    switch (w) {
      case 4:
        common_hv_8ht_8vt_and_aver_dst_4w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 8:
        common_hv_8ht_8vt_and_aver_dst_8w_msa(src, (int32_t)src_stride, dst,
                                              (int32_t)dst_stride, filt_hor,
                                              filt_ver, h);
        break;
      case 16:
        common_hv_8ht_8vt_and_aver_dst_16w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 32:
        common_hv_8ht_8vt_and_aver_dst_32w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      case 64:
        common_hv_8ht_8vt_and_aver_dst_64w_msa(src, (int32_t)src_stride, dst,
                                               (int32_t)dst_stride, filt_hor,
                                               filt_ver, h);
        break;
      default:
        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x,
                            x_step_q4, filter_y, y_step_q4, w, h);
        break;
    }
  }
}
605