/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

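/* Shuffle control bytes for the vshf.b-based filters below: indices 0-15
 * select bytes from the first source vector, 16-31 from the second. The
 * first row interleaves the overlapping sample pairs needed for 8-column
 * filtering; the second row packs two 4-column rows taken from two
 * different source vectors into a single register. */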
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

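/*
 * The hevc_copy_*_msa functions implement the HEVC "pixel copy" MC step
 * for 8-bit input: each source sample is widened to 16 bits and
 * pre-scaled by 64 (<< (14 - bit depth)), the intermediate format the
 * weighting/bi-prediction code expects. A scalar sketch (illustrative
 * only) of what every width variant below vectorizes:
 *
 *     for (y = 0; y < height; y++) {
 *         for (x = 0; x < width; x++)
 *             dst[x] = src[x] << 6;
 *         src += src_stride;
 *         dst += dst_stride;
 *     }
 */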
static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0;

        LD_SB2(src, src_stride, src0, src1);

        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_b(zero, src0);
        in0 <<= 6;
        ST_D2(in0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3;
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            SLLI_4V(in0, in1, in2, in3, 6);
            ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in4, in5, in6, in7);
        SLLI_4V(in0, in1, in2, in3, 6);
        SLLI_4V(in4, in5, in6, in7, 6);
        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0, in1;

        LD_SB2(src, src_stride, src0, src1);

        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH2(in0, in1, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;

        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
        SLLI_4V(in0, in1, in2, in3, 6);
        in4 <<= 6;
        in5 <<= 6;
        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in4, in5, in6, in7);
            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    v16i8 zero = { 0 };

    if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);

            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;
    }
}

static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src8, src9, src10, src11;
    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;
        LD_SB3(src, 16, src6, src7, src8);
        src += src_stride;
        LD_SB3(src, 16, src9, src10, src11);
        src += src_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
        ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
        ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;
    }
}

static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;
    }
}

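/*
 * Horizontal 8-tap filters. src is rewound by 3 samples so the shuffle
 * window covers taps -3..+4 around each output column. Input bytes are
 * XORed with 128 to make them signed for the dpadd_s.b dot product;
 * since the 8 HEVC filter taps sum to 64, that offsets every result by
 * -128 * 64, and const_vec = 128 << 6 is pre-loaded into the
 * accumulators to cancel the bias exactly. A scalar sketch
 * (illustrative only) of one output sample:
 *
 *     sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += filter[k] * src[x + k - 3];
 *     dst[x] = sum;   // 16-bit intermediate, no rounding/shift here
 */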
static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

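/* The 12-column variant filters the left 8 columns with the 8-wide
 * masks (mask0..mask3) and the remaining 4 columns with the 4-wide
 * masks (mask4..mask7) applied to a second set of loads from src + 8,
 * packing two rows per register for the narrow half. */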
static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int64_t res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        res0 = __msa_copy_s_d((v2i64) dst4, 0);
        res1 = __msa_copy_s_d((v2i64) dst4, 1);
        res2 = __msa_copy_s_d((v2i64) dst5, 0);
        res3 = __msa_copy_s_d((v2i64) dst5, 1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        ST_SH2(dst0, dst2, dst, dst_stride);
        ST_SH2(dst1, dst3, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    }
}

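/* For 24 columns, two 16-byte loads cover each row; mask4..mask7
 * (mask0 + 8 and up) shuffle across the seam between the two loads, so
 * the middle 8 output columns are filtered without a third, unaligned
 * load. */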
static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
    }
}

static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
}

static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
        ST_SH2(dst4, dst5, (dst + 32), 8);
        dst += dst_stride;
    }
}

static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src4 = LD_SB(src + 56);
        src += src_stride;
        XORI_B5_128_SB(src0, src1, src2, src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_SH(dst0, dst);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst6 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
        dst += dst_stride;
    }
}

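/*
 * Vertical 8-tap filters. src is rewound by 3 rows; consecutive rows
 * are byte-interleaved (ILVR/ILVL) so a dpadd_s.b over the interleaved
 * pairs accumulates along columns. Each loop iteration loads fresh
 * rows, reuses the interleaves kept from the previous iteration, and
 * slides the tap window down. A scalar sketch (illustrative only) of
 * one output sample:
 *
 *     sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += filter[k] * src[(y + k - 3) * src_stride + x];
 *     dst[y * dst_stride + x] = sum;
 */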
static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

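/* Generic vertical filter for widths that are multiples of 16: the
 * outer loop walks 16-column stripes, the inner loop produces 4 rows
 * per iteration from right- and left-half interleaves of the same row
 * pairs. */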
static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
                                        int32_t src_stride,
                                        int16_t *dst,
                                        int32_t dst_stride,
                                        const int8_t *filter,
                                        int32_t height,
                                        int32_t width)
{
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    int32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = width >> 4; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src7, src8, src9, src10);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst2_r = const_vec;
            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                         filt0, filt1, filt2, filt3,
                         dst2_r, dst2_r, dst2_r, dst2_r);
            dst3_r = const_vec;
            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                         filt0, filt1, filt2, filt3,
                         dst3_r, dst3_r, dst3_r, dst3_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
            dst2_l = const_vec;
            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
                         filt0, filt1, filt2, filt3,
                         dst2_l, dst2_l, dst2_l, dst2_l);
            dst3_l = const_vec;
            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
                         filt0, filt1, filt2, filt3,
                         dst3_l, dst3_l, dst3_l, dst3_l);

            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 16);
}

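/* The remaining widths map onto the 16-column helper; 24 is split into
 * a 16-column stripe plus an 8-column run. */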
static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 16);
    hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
                      filter, height);
}

static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 32);
}

static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 48);
}

static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 64);
}

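/*
 * 2-D (hv) 8-tap filters, computed separably: the horizontal pass
 * produces 16-bit intermediates with the same 128-XOR/const_vec bias
 * cancellation as the hz filters, and the vertical pass combines eight
 * of those rows at 32-bit precision in HEVC_FILT_8TAP, then shifts
 * right by 6. src is rewound by 3 rows and 3 columns so the prolog can
 * compute the seven intermediate rows above the first output row; the
 * 4-column variant pairs rows (0,3), (1,4), ... per register to halve
 * the horizontal work.
 */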
static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst97 = const_vec;
        dst108 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst97, dst97, dst97, dst97);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst108, dst108, dst108, dst108);

        dst76_r = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst54_r;
        dst32_r = dst76_r;
        dst54_r = dst98_r;
        dst21_r = dst65_r;
        dst43_r = dst87_r;
        dst65_r = dst109_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    }
}

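/* 8-column-stripe version of the hv filter: one output row per inner
 * iteration, keeping a sliding window of eight 16-bit intermediate rows
 * (dst0..dst7) per stripe. */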
hevc_hv_8t_8multx1mult_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height,int32_t width)1467 static void hevc_hv_8t_8multx1mult_msa(uint8_t *src,
1468 int32_t src_stride,
1469 int16_t *dst,
1470 int32_t dst_stride,
1471 const int8_t *filter_x,
1472 const int8_t *filter_y,
1473 int32_t height, int32_t width)
1474 {
1475 uint32_t loop_cnt, cnt;
1476 uint8_t *src_tmp;
1477 int16_t *dst_tmp;
1478 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1479 v8i16 filt0, filt1, filt2, filt3;
1480 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1481 v16i8 mask1, mask2, mask3;
1482 v8i16 filter_vec, const_vec;
1483 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1484 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1485 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1486 v4i32 dst0_r, dst0_l;
1487 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1488 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1489 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1490
1491 src -= ((3 * src_stride) + 3);
1492 filter_vec = LD_SH(filter_x);
1493 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1494
1495 filter_vec = LD_SH(filter_y);
1496 UNPCK_R_SB_SH(filter_vec, filter_vec);
1497
1498 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1499
1500 mask1 = mask0 + 2;
1501 mask2 = mask0 + 4;
1502 mask3 = mask0 + 6;
1503
1504 const_vec = __msa_ldi_h(128);
1505 const_vec <<= 6;
1506
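    /* The frame is processed in 8-column strips; the first seven rows of
     * each strip are filtered horizontally up front so the vertical 8-tap
     * always sees a full window, then one new row is added per iteration. */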
1507 for (cnt = width >> 3; cnt--;) {
1508 src_tmp = src;
1509 dst_tmp = dst;
1510
1511 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1512 src_tmp += (7 * src_stride);
1513 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1514
1515 /* row 0 row 1 row 2 row 3 */
1516 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1517 vec0, vec1, vec2, vec3);
1518 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1519 vec4, vec5, vec6, vec7);
1520 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1521 vec8, vec9, vec10, vec11);
1522 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1523 vec12, vec13, vec14, vec15);
1524 dst0 = const_vec;
1525 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1526 dst0, dst0, dst0, dst0);
1527 dst1 = const_vec;
1528 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1529 dst1, dst1, dst1, dst1);
1530 dst2 = const_vec;
1531 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1532 dst2, dst2, dst2, dst2);
1533 dst3 = const_vec;
1534 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1535 dst3, dst3, dst3, dst3);
1536
1537 /* row 4 row 5 row 6 */
1538 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1539 vec0, vec1, vec2, vec3);
1540 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1541 vec4, vec5, vec6, vec7);
1542 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1543 vec8, vec9, vec10, vec11);
1544 dst4 = const_vec;
1545 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1546 dst4, dst4, dst4, dst4);
1547 dst5 = const_vec;
1548 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1549 dst5, dst5, dst5, dst5);
1550 dst6 = const_vec;
1551 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1552 dst6, dst6, dst6, dst6);
1553
1554 for (loop_cnt = height; loop_cnt--;) {
1555 src7 = LD_SB(src_tmp);
1556 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1557 src_tmp += src_stride;
1558
1559 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1560 vec0, vec1, vec2, vec3);
1561 dst7 = const_vec;
1562 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1563 dst7, dst7, dst7, dst7);
1564
1565 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1566 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1567 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1568 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1569 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1570 filt_h0, filt_h1, filt_h2, filt_h3);
1571 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1572 filt_h0, filt_h1, filt_h2, filt_h3);
1573 dst0_r >>= 6;
1574 dst0_l >>= 6;
1575
1576 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1577 ST_SW(dst0_r, dst_tmp);
1578 dst_tmp += dst_stride;
1579
1580 dst0 = dst1;
1581 dst1 = dst2;
1582 dst2 = dst3;
1583 dst3 = dst4;
1584 dst4 = dst5;
1585 dst5 = dst6;
1586 dst6 = dst7;
1587 }
1588
1589 src += 8;
1590 dst += 8;
1591 }
1592 }
1593
1594 static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
1595 int16_t *dst, int32_t dst_stride,
1596 const int8_t *filter_x, const int8_t *filter_y,
1597 int32_t height)
1598 {
1599 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1600 filter_x, filter_y, height, 8);
1601 }
1602
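/* 12 = 8 + 4: the left 8 columns follow the one-row-at-a-time scheme of
 * hevc_hv_8t_8multx1mult_msa; the remaining 4 columns are then filtered
 * four rows at a time using the two-rows-per-register masks loaded from
 * ff_hevc_mask_arr + 16. */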
1603 static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
1604 int16_t *dst, int32_t dst_stride,
1605 const int8_t *filter_x, const int8_t *filter_y,
1606 int32_t height)
1607 {
1608 uint32_t loop_cnt;
1609 uint8_t *src_tmp;
1610 int16_t *dst_tmp;
1611 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1612 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1613 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1615 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1616 v8i16 filter_vec, const_vec;
1617 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1618 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1619 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1620 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1621 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
1622
1623 src -= ((3 * src_stride) + 3);
1624 filter_vec = LD_SH(filter_x);
1625 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1626
1627 filter_vec = LD_SH(filter_y);
1628 UNPCK_R_SB_SH(filter_vec, filter_vec);
1629
1630 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1631
1632 mask0 = LD_SB(ff_hevc_mask_arr);
1633 mask1 = mask0 + 2;
1634 mask2 = mask0 + 4;
1635 mask3 = mask0 + 6;
1636
1637 const_vec = __msa_ldi_h(128);
1638 const_vec <<= 6;
1639
1640 src_tmp = src;
1641 dst_tmp = dst;
1642
1643 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1644 src_tmp += (7 * src_stride);
1645 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1646
1647 /* row 0 row 1 row 2 row 3 */
1648 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1649 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1650 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1651 vec11);
1652 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1653 vec15);
1654 dst0 = const_vec;
1655 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
1656 dst0, dst0);
1657 dst1 = const_vec;
1658 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
1659 dst1, dst1);
1660 dst2 = const_vec;
1661 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
1662 dst2, dst2, dst2);
1663 dst3 = const_vec;
1664 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
1665 dst3, dst3, dst3);
1666
1667 /* row 4 row 5 row 6 */
1668 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1669 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1670 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1671 vec11);
1672 dst4 = const_vec;
1673 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
1674 dst4, dst4);
1675 dst5 = const_vec;
1676 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
1677 dst5, dst5);
1678 dst6 = const_vec;
1679 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
1680 dst6, dst6, dst6);
1681
1682 for (loop_cnt = height; loop_cnt--;) {
1683 src7 = LD_SB(src_tmp);
1684 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1685 src_tmp += src_stride;
1686
1687 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1688 vec3);
1689 dst7 = const_vec;
1690 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
1691 dst7, dst7, dst7);
1692
1693 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1694 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1695 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1696 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1697 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1698 filt_h1, filt_h2, filt_h3);
1699 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1700 filt_h1, filt_h2, filt_h3);
1701 dst0_r >>= 6;
1702 dst0_l >>= 6;
1703
1704 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1705 ST_SW(dst0_r, dst_tmp);
1706 dst_tmp += dst_stride;
1707
1708 dst0 = dst1;
1709 dst1 = dst2;
1710 dst2 = dst3;
1711 dst3 = dst4;
1712 dst4 = dst5;
1713 dst5 = dst6;
1714 dst6 = dst7;
1715 }
1716
1717 src += 8;
1718 dst += 8;
1719
1720 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1721 mask5 = mask4 + 2;
1722 mask6 = mask4 + 4;
1723 mask7 = mask4 + 6;
1724
1725 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1726 src += (7 * src_stride);
1727 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1728
1729 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1730 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1731 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1732 vec11);
1733 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1734 vec15);
1735 dst30 = const_vec;
1736 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
1737 dst30, dst30, dst30);
1738 dst41 = const_vec;
1739 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
1740 dst41, dst41, dst41);
1741 dst52 = const_vec;
1742 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
1743 dst52, dst52, dst52);
1744 dst63 = const_vec;
1745 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
1746 dst63, dst63, dst63);
1747
1748 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1749 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1750 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1751
1752 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1753
1754 for (loop_cnt = height >> 2; loop_cnt--;) {
1755 LD_SB4(src, src_stride, src7, src8, src9, src10);
1756 src += (4 * src_stride);
1757 XORI_B4_128_SB(src7, src8, src9, src10);
1758
1759 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1760 vec3);
1761 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1762 vec7);
1763 dst97 = const_vec;
1764 dst108 = const_vec;
1765 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
1766 dst97, dst97, dst97);
1767 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
1768 dst108, dst108, dst108);
1769
1770 dst76_r = __msa_ilvr_h(dst97, dst66);
1771 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1772 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1773 dst98_r = __msa_ilvr_h(dst66, dst108);
1774
1775 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1776 filt_h1, filt_h2, filt_h3);
1777 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1778 filt_h1, filt_h2, filt_h3);
1779 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1780 filt_h1, filt_h2, filt_h3);
1781 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1782 filt_h1, filt_h2, filt_h3);
1783 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1784 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1785 ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
1786 dst += (4 * dst_stride);
1787
1788 dst10_r = dst54_r;
1789 dst32_r = dst76_r;
1790 dst54_r = dst98_r;
1791 dst21_r = dst65_r;
1792 dst43_r = dst87_r;
1793 dst65_r = dst109_r;
1794 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1795 }
1796 }
1797
1798 static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
1799 int16_t *dst, int32_t dst_stride,
1800 const int8_t *filter_x, const int8_t *filter_y,
1801 int32_t height)
1802 {
1803 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1804 filter_x, filter_y, height, 16);
1805 }
1806
1807 static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
1808 int16_t *dst, int32_t dst_stride,
1809 const int8_t *filter_x, const int8_t *filter_y,
1810 int32_t height)
1811 {
1812 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1813 filter_x, filter_y, height, 24);
1814 }
1815
1816 static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
1817 int16_t *dst, int32_t dst_stride,
1818 const int8_t *filter_x, const int8_t *filter_y,
1819 int32_t height)
1820 {
1821 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1822 filter_x, filter_y, height, 32);
1823 }
1824
1825 static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
1826 int16_t *dst, int32_t dst_stride,
1827 const int8_t *filter_x, const int8_t *filter_y,
1828 int32_t height)
1829 {
1830 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1831 filter_x, filter_y, height, 48);
1832 }
1833
1834 static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
1835 int16_t *dst, int32_t dst_stride,
1836 const int8_t *filter_x, const int8_t *filter_y,
1837 int32_t height)
1838 {
1839 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1840 filter_x, filter_y, height, 64);
1841 }
1842
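/* 4-tap horizontal (epel) paths. mask0 = ff_hevc_mask_arr + 16 packs two
 * 4-wide rows into one register: VSHF.B control values >= 16 select bytes
 * from the second source operand. As in the 8-tap paths, const_vec =
 * 128 << 6 cancels the XORI(128) signed conversion, the 4-tap coefficients
 * summing to 64 as well. */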
1843 static void hevc_hz_4t_4x2_msa(uint8_t *src,
1844 int32_t src_stride,
1845 int16_t *dst,
1846 int32_t dst_stride,
1847 const int8_t *filter)
1848 {
1849 v8i16 filt0, filt1;
1850 v16i8 src0, src1;
1851 v16i8 mask1, vec0, vec1;
1852 v8i16 dst0;
1853 v8i16 filter_vec, const_vec;
1854 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1855
1856 src -= 1;
1857
1858 filter_vec = LD_SH(filter);
1859 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1860
1861 mask1 = mask0 + 2;
1862
1863 const_vec = __msa_ldi_h(128);
1864 const_vec <<= 6;
1865
1866 LD_SB2(src, src_stride, src0, src1);
1867 XORI_B2_128_SB(src0, src1);
1868
1869 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1870 dst0 = const_vec;
1871 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1872
1873 ST_D2(dst0, 0, 1, dst, dst_stride);
1874 }
1875
1876 static void hevc_hz_4t_4x4_msa(uint8_t *src,
1877 int32_t src_stride,
1878 int16_t *dst,
1879 int32_t dst_stride,
1880 const int8_t *filter)
1881 {
1882 v8i16 filt0, filt1;
1883 v16i8 src0, src1, src2, src3;
1884 v16i8 mask1, vec0, vec1;
1885 v8i16 dst0, dst1;
1886 v8i16 filter_vec, const_vec;
1887 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1888
1889 src -= 1;
1890
1891 filter_vec = LD_SH(filter);
1892 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1893
1894 mask1 = mask0 + 2;
1895
1896 const_vec = __msa_ldi_h(128);
1897 const_vec <<= 6;
1898
1899 LD_SB4(src, src_stride, src0, src1, src2, src3);
1900 XORI_B4_128_SB(src0, src1, src2, src3);
1901
1902 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1903 dst0 = const_vec;
1904 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1905
1906 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1907 dst1 = const_vec;
1908 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1909
1910 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1911 }
1912
1913 static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
1914 int32_t src_stride,
1915 int16_t *dst,
1916 int32_t dst_stride,
1917 const int8_t *filter,
1918 int32_t height)
1919 {
1920 uint32_t loop_cnt;
1921 v8i16 filt0, filt1;
1922 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1923 v16i8 mask1, vec0, vec1;
1924 v8i16 dst0, dst1, dst2, dst3;
1925 v8i16 filter_vec, const_vec;
1926 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1927
1928 src -= 1;
1929
1930 filter_vec = LD_SH(filter);
1931 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1932
1933 mask1 = mask0 + 2;
1934
1935 const_vec = __msa_ldi_h(128);
1936 const_vec <<= 6;
1937
1938 for (loop_cnt = (height >> 3); loop_cnt--;) {
1939 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1940 src += (8 * src_stride);
1941
1942 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1943
1944 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1945 dst0 = const_vec;
1946 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1947 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1948 dst1 = const_vec;
1949 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1950 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
1951 dst2 = const_vec;
1952 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1953 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
1954 dst3 = const_vec;
1955 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
1956
1957 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
1958 dst += (8 * dst_stride);
1959 }
1960 }
1961
1962 static void hevc_hz_4t_4w_msa(uint8_t *src,
1963 int32_t src_stride,
1964 int16_t *dst,
1965 int32_t dst_stride,
1966 const int8_t *filter,
1967 int32_t height)
1968 {
1969 if (2 == height) {
1970 hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
1971 } else if (4 == height) {
1972 hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1973 } else if (0 == height % 8) {
1974 hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
1975 filter, height);
1976 }
1977 }
1978
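/* A 6-wide output row is twelve bytes of int16: stored as one 64-bit copy
 * (columns 0..3) followed by one 32-bit copy (columns 4..5) at dst + 4. */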
1979 static void hevc_hz_4t_6w_msa(uint8_t *src,
1980 int32_t src_stride,
1981 int16_t *dst,
1982 int32_t dst_stride,
1983 const int8_t *filter,
1984 int32_t height)
1985 {
1986 uint32_t loop_cnt;
1987 uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
1988 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
1989 v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
1990 v16i8 src0, src1, src2, src3;
1991 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1992 v16i8 mask1;
1993 v16i8 vec0, vec1;
1994 v8i16 filter_vec, const_vec;
1995
1996 src -= 1;
1997
1998 filter_vec = LD_SH(filter);
1999 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2000
2001 mask1 = mask0 + 2;
2002
2003 const_vec = __msa_ldi_h(128);
2004 const_vec <<= 6;
2005
2006 for (loop_cnt = 2; loop_cnt--;) {
2007 LD_SB4(src, src_stride, src0, src1, src2, src3);
2008 src += (4 * src_stride);
2009
2010 XORI_B4_128_SB(src0, src1, src2, src3);
2011
2012 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2013 dst0 = const_vec;
2014 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2015 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2016 dst1 = const_vec;
2017 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2018 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2019 dst2 = const_vec;
2020 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2021 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2022 dst3 = const_vec;
2023 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2024
2025 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2026 dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
2027 dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
2028 dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
2029
2030 dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
2031 dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
2032 dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
2033 dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
2034
2035 SD(dst_val0, dst);
2036 SW(dst_val_int0, dst + 4);
2037 dst += dst_stride;
2038 SD(dst_val1, dst);
2039 SW(dst_val_int1, dst + 4);
2040 dst += dst_stride;
2041 SD(dst_val2, dst);
2042 SW(dst_val_int2, dst + 4);
2043 dst += dst_stride;
2044 SD(dst_val3, dst);
2045 SW(dst_val_int3, dst + 4);
2046 dst += dst_stride;
2047 }
2048 }
2049
2050 static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
2051 int32_t src_stride,
2052 int16_t *dst,
2053 int32_t dst_stride,
2054 const int8_t *filter,
2055 int32_t height)
2056 {
2057 uint32_t loop_cnt;
2058 v8i16 filt0, filt1, dst0, dst1;
2059 v16i8 src0, src1;
2060 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2061 v16i8 mask1;
2062 v16i8 vec0, vec1;
2063 v8i16 filter_vec, const_vec;
2064
2065 src -= 1;
2066
2067 filter_vec = LD_SH(filter);
2068 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2069
2070 mask1 = mask0 + 2;
2071
2072 const_vec = __msa_ldi_h(128);
2073 const_vec <<= 6;
2074
2075 for (loop_cnt = (height >> 1); loop_cnt--;) {
2076 LD_SB2(src, src_stride, src0, src1);
2077 src += (2 * src_stride);
2078
2079 XORI_B2_128_SB(src0, src1);
2080
2081 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2082 dst0 = const_vec;
2083 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2084
2085 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2086 dst1 = const_vec;
2087 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2088
2089 ST_SH2(dst0, dst1, dst, dst_stride);
2090 dst += (2 * dst_stride);
2091 }
2092 }
2093
2094 static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
2095 int32_t src_stride,
2096 int16_t *dst,
2097 int32_t dst_stride,
2098 const int8_t *filter,
2099 int32_t height)
2100 {
2101 uint32_t loop_cnt;
2102 v8i16 filt0, filt1;
2103 v16i8 src0, src1, src2, src3;
2104 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2105 v16i8 mask1;
2106 v16i8 vec0, vec1;
2107 v8i16 dst0, dst1, dst2, dst3;
2108 v8i16 filter_vec, const_vec;
2109
2110 src -= 1;
2111
2112 filter_vec = LD_SH(filter);
2113 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2114
2115 mask1 = mask0 + 2;
2116
2117 const_vec = __msa_ldi_h(128);
2118 const_vec <<= 6;
2119
2120 for (loop_cnt = (height >> 2); loop_cnt--;) {
2121 LD_SB4(src, src_stride, src0, src1, src2, src3);
2122 src += (4 * src_stride);
2123
2124 XORI_B4_128_SB(src0, src1, src2, src3);
2125
2126 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2127 dst0 = const_vec;
2128 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2129
2130 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2131 dst1 = const_vec;
2132 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2133
2134 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2135 dst2 = const_vec;
2136 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2137
2138 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2139 dst3 = const_vec;
2140 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2141
2142 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2143 dst += (4 * dst_stride);
2144 }
2145 }
2146
2147 static void hevc_hz_4t_8w_msa(uint8_t *src,
2148 int32_t src_stride,
2149 int16_t *dst,
2150 int32_t dst_stride,
2151 const int8_t *filter,
2152 int32_t height)
2153 {
2154 if (2 == height || 6 == height) {
2155 hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
2156 filter, height);
2157 } else {
2158 hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2159 filter, height);
2160 }
2161 }
2162
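/* 12 = 8 + 4: columns 0..7 use the single-register masks, while mask2 and
 * mask3 (indices 8..12 and 24..28) gather the four rightmost columns of
 * two rows at once through a two-register VSHF.B. */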
2163 static void hevc_hz_4t_12w_msa(uint8_t *src,
2164 int32_t src_stride,
2165 int16_t *dst,
2166 int32_t dst_stride,
2167 const int8_t *filter,
2168 int32_t height)
2169 {
2170 uint32_t loop_cnt;
2171 v8i16 filt0, filt1;
2172 v16i8 src0, src1, src2, src3;
2173 v16i8 mask1;
2174 v16i8 vec0, vec1;
2175 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2176 v8i16 filter_vec, const_vec;
2177 v16i8 mask3;
2178 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2179 v16i8 mask2 = {
2180 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2181 };
2182
2183 src -= 1;
2184
2185 filter_vec = LD_SH(filter);
2186 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2187
2188 mask1 = mask0 + 2;
2189 mask3 = mask2 + 2;
2190
2191 const_vec = __msa_ldi_h(128);
2192 const_vec <<= 6;
2193
2194 for (loop_cnt = (height >> 2); loop_cnt--;) {
2195 LD_SB4(src, src_stride, src0, src1, src2, src3);
2196 src += (4 * src_stride);
2197 XORI_B4_128_SB(src0, src1, src2, src3);
2198
2199 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2200 dst0 = const_vec;
2201 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2202 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2203 dst1 = const_vec;
2204 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2205 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2206 dst2 = const_vec;
2207 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2208 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2209 dst3 = const_vec;
2210 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2211 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2212 dst4 = const_vec;
2213 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2214 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2215 dst5 = const_vec;
2216 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2217
2218 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2219 ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
2220 dst += (4 * dst_stride);
2221 }
2222 }
2223
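/* 16 columns per row are filtered as two overlapping 8-column halves: the
 * even-numbered src registers hold bytes 0..15 of a row, the odd-numbered
 * ones bytes 8..23, so each half only needs the single-register masks. */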
2224 static void hevc_hz_4t_16w_msa(uint8_t *src,
2225 int32_t src_stride,
2226 int16_t *dst,
2227 int32_t dst_stride,
2228 const int8_t *filter,
2229 int32_t height)
2230 {
2231 uint32_t loop_cnt;
2232 v16i8 src0, src1, src2, src3;
2233 v16i8 src4, src5, src6, src7;
2234 v8i16 filt0, filt1;
2235 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2236 v16i8 mask1;
2237 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2238 v16i8 vec0, vec1;
2239 v8i16 filter_vec, const_vec;
2240
2241 src -= 1;
2242
2243 filter_vec = LD_SH(filter);
2244 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2245
2246 mask1 = mask0 + 2;
2247
2248 const_vec = __msa_ldi_h(128);
2249 const_vec <<= 6;
2250
2251 for (loop_cnt = (height >> 2); loop_cnt--;) {
2252 LD_SB4(src, src_stride, src0, src2, src4, src6);
2253 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2254 src += (4 * src_stride);
2255
2256 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2257
2258 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2259 dst0 = const_vec;
2260 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2261
2262 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2263 dst1 = const_vec;
2264 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2265
2266 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2267 dst2 = const_vec;
2268 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2269
2270 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2271 dst3 = const_vec;
2272 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2273
2274 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2275 dst4 = const_vec;
2276 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2277
2278 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2279 dst5 = const_vec;
2280 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2281
2282 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2283 dst6 = const_vec;
2284 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2285
2286 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2287 dst7 = const_vec;
2288 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
2289
2290 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
2291 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
2292 dst += (4 * dst_stride);
2293 }
2294 }
2295
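/* 24 = 16 + 8: the left 16 columns combine single-register shuffles
 * (mask0/mask1) with the cross-register mask00/mask11 pair, and the right
 * 8 columns come straight from the second 16-byte load (dst_tmp column). */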
2296 static void hevc_hz_4t_24w_msa(uint8_t *src,
2297 int32_t src_stride,
2298 int16_t *dst,
2299 int32_t dst_stride,
2300 const int8_t *filter,
2301 int32_t height)
2302 {
2303 uint32_t loop_cnt;
2304 int16_t *dst_tmp = dst + 16;
2305 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2306 v8i16 filt0, filt1;
2307 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2308 v16i8 mask1, mask00, mask11;
2309 v16i8 vec0, vec1;
2310 v8i16 dst0, dst1, dst2, dst3;
2311 v8i16 filter_vec, const_vec;
2312
2313 src -= 1;
2314
2315 filter_vec = LD_SH(filter);
2316 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2317
2318 mask1 = mask0 + 2;
2319 mask00 = mask0 + 8;
2320 mask11 = mask0 + 10;
2321
2322 const_vec = __msa_ldi_h(128);
2323 const_vec <<= 6;
2324
2325 for (loop_cnt = (height >> 2); loop_cnt--;) {
2326 /* 16 width */
2327 LD_SB4(src, src_stride, src0, src2, src4, src6);
2328 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2329 src += (4 * src_stride);
2330
2331 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2332
2333 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2334 dst0 = const_vec;
2335 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2336
2337 VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
2338 dst1 = const_vec;
2339 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2340
2341 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2342 dst2 = const_vec;
2343 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2344
2345 VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
2346 dst3 = const_vec;
2347 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2348
2349 ST_SH2(dst0, dst1, dst, 8);
2350 dst += dst_stride;
2351 ST_SH2(dst2, dst3, dst, 8);
2352 dst += dst_stride;
2353
2354 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2355 dst0 = const_vec;
2356 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2357
2358 VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
2359 dst1 = const_vec;
2360 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2361
2362 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2363 dst2 = const_vec;
2364 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2365
2366 VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
2367 dst3 = const_vec;
2368 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2369
2370 ST_SH2(dst0, dst1, dst, 8);
2371 dst += dst_stride;
2372 ST_SH2(dst2, dst3, dst, 8);
2373 dst += dst_stride;
2374
2375 /* 8 width */
2376 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2377 dst0 = const_vec;
2378 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2379
2380 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2381 dst1 = const_vec;
2382 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2383
2384 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2385 dst2 = const_vec;
2386 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2387
2388 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2389 dst3 = const_vec;
2390 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2391
2392 ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
2393 dst_tmp += (4 * dst_stride);
2394 }
2395 }
2396
2397 static void hevc_hz_4t_32w_msa(uint8_t *src,
2398 int32_t src_stride,
2399 int16_t *dst,
2400 int32_t dst_stride,
2401 const int8_t *filter,
2402 int32_t height)
2403 {
2404 uint32_t loop_cnt;
2405 v16i8 src0, src1, src2;
2406 v8i16 filt0, filt1;
2407 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2408 v16i8 mask1, mask2, mask3;
2409 v8i16 dst0, dst1, dst2, dst3;
2410 v16i8 vec0, vec1, vec2, vec3;
2411 v8i16 filter_vec, const_vec;
2412
2413 src -= 1;
2414
2415 filter_vec = LD_SH(filter);
2416 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2417
2418 const_vec = __msa_ldi_h(128);
2419 const_vec <<= 6;
2420
2421 mask1 = mask0 + 2;
2422 mask2 = mask0 + 8;
2423 mask3 = mask0 + 10;
2424
2425 for (loop_cnt = height; loop_cnt--;) {
2426 LD_SB2(src, 16, src0, src1);
2427 src2 = LD_SB(src + 24);
2428 src += src_stride;
2429
2430 XORI_B3_128_SB(src0, src1, src2);
2431
2432 dst0 = const_vec;
2433 dst1 = const_vec;
2434 dst2 = const_vec;
2435 dst3 = const_vec;
2436 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2437 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2438 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2439 dst1, dst2, dst3);
2440 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2441 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2442 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2443 dst1, dst2, dst3);
2444 ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
2445 dst += dst_stride;
2446 }
2447 }
2448
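/* 4-tap vertical (epel) paths. Adjacent rows are interleaved (ILVR/ILVL)
 * so one DPADD_SB2_SH accumulates two taps at a time, and the interleaved
 * row pairs form a sliding window carried between iterations. Scalar
 * sketch of one output sample, as a reading of the vector code:
 *
 *     dst[y][x] = sum(i = 0..3) filter[i] * src[y - 1 + i][x]
 *
 * kept as a 16-bit intermediate with no rounding shift here. */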
2449 static void hevc_vt_4t_4x2_msa(uint8_t *src,
2450 int32_t src_stride,
2451 int16_t *dst,
2452 int32_t dst_stride,
2453 const int8_t *filter)
2454 {
2455 v16i8 src0, src1, src2, src3, src4;
2456 v16i8 src10_r, src32_r, src21_r, src43_r;
2457 v16i8 src2110, src4332;
2458 v8i16 dst10;
2459 v8i16 filt0, filt1;
2460 v8i16 filter_vec, const_vec;
2461
2462 src -= src_stride;
2463
2464 const_vec = __msa_ldi_h(128);
2465 const_vec <<= 6;
2466
2467 filter_vec = LD_SH(filter);
2468 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2469
2470 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2471 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2472 src10_r, src21_r, src32_r, src43_r);
2473
2474 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2475 XORI_B2_128_SB(src2110, src4332);
2476 dst10 = const_vec;
2477 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2478
2479 ST_D2(dst10, 0, 1, dst, dst_stride);
2480 }
2481
2482 static void hevc_vt_4t_4x4_msa(uint8_t *src,
2483 int32_t src_stride,
2484 int16_t *dst,
2485 int32_t dst_stride,
2486 const int8_t *filter,
2487 int32_t height)
2488 {
2489 v16i8 src0, src1, src2, src3, src4, src5, src6;
2490 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2491 v16i8 src2110, src4332, src6554;
2492 v8i16 dst10, dst32;
2493 v8i16 filt0, filt1;
2494 v8i16 filter_vec, const_vec;
2495
2496 src -= src_stride;
2497
2498 const_vec = __msa_ldi_h(128);
2499 const_vec <<= 6;
2500
2501 filter_vec = LD_SH(filter);
2502 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2503
2504 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2505 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2506 src10_r, src21_r, src32_r, src43_r);
2507 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2508 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2509 src2110, src4332, src6554);
2510 XORI_B3_128_SB(src2110, src4332, src6554);
2511 dst10 = const_vec;
2512 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2513 dst32 = const_vec;
2514 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2515
2516 ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
2517 }
2518
2519 static void hevc_vt_4t_4x8_msa(uint8_t *src,
2520 int32_t src_stride,
2521 int16_t *dst,
2522 int32_t dst_stride,
2523 const int8_t *filter,
2524 int32_t height)
2525 {
2526 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2527 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2528 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2529 v16i8 src2110, src4332, src6554, src8776, src10998;
2530 v8i16 dst10, dst32, dst54, dst76;
2531 v8i16 filt0, filt1;
2532 v8i16 filter_vec, const_vec;
2533
2534 src -= src_stride;
2535 const_vec = __msa_ldi_h(128);
2536 const_vec <<= 6;
2537
2538 filter_vec = LD_SH(filter);
2539 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2540
2541 LD_SB3(src, src_stride, src0, src1, src2);
2542 src += (3 * src_stride);
2543
2544 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2545 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2546 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2547
2548 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2549 src += (8 * src_stride);
2550 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2551 src32_r, src43_r, src54_r, src65_r);
2552 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
2553 src76_r, src87_r, src98_r, src109_r);
2554 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2555 src98_r, src4332, src6554, src8776, src10998);
2556 XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2557 dst10 = const_vec;
2558 dst32 = const_vec;
2559 dst54 = const_vec;
2560 dst76 = const_vec;
2561 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2562 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2563 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2564 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2565 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2566 }
2567
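/* 16 rows are produced as two 8-row batches; the second batch re-seeds the
 * sliding window from src10 and src10998 left behind by the first. */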
2568 static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2569 int16_t *dst, int32_t dst_stride,
2570 const int8_t *filter, int32_t height)
2571 {
2572 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2573 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
2574 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
2575 v16i8 src10998;
2576 v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;
2577
2578 src -= src_stride;
2579 const_vec = __msa_ldi_h(128);
2580 const_vec <<= 6;
2581
2582 filter_vec = LD_SH(filter);
2583 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2584
2585 LD_SB3(src, src_stride, src0, src1, src2);
2586 src += (3 * src_stride);
2587
2588 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2589 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2590 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2591
2592 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2593 src += (8 * src_stride);
2594 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
2595 src54_r, src65_r);
2596 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
2597 src87_r, src98_r, src109_r);
2598 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2599 src98_r, src4332, src6554, src8776, src10998);
2600 XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2601
2602 dst10 = const_vec;
2603 dst32 = const_vec;
2604 dst54 = const_vec;
2605 dst76 = const_vec;
2606 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2607 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2608 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2609 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2610 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2611 dst += (8 * dst_stride);
2612
2613 src2 = src10;
2614 src2110 = src10998;
2615
2616 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
2617 src += (8 * src_stride);
2618
2619 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
2620 src54_r, src65_r);
2621 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
2622 src87_r, src98_r, src109_r);
2623 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
2624 src98_r, src4332, src6554, src8776, src10998);
2625 XORI_B4_128_SB(src4332, src6554, src8776, src10998);
2626
2627 dst10 = const_vec;
2628 dst32 = const_vec;
2629 dst54 = const_vec;
2630 dst76 = const_vec;
2631 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2632 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2633 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
2634 DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
2635 ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2636 }
2637
2638 static void hevc_vt_4t_4w_msa(uint8_t *src,
2639 int32_t src_stride,
2640 int16_t *dst,
2641 int32_t dst_stride,
2642 const int8_t *filter,
2643 int32_t height)
2644 {
2645 if (2 == height) {
2646 hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2647 } else if (4 == height) {
2648 hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
2649 } else if (8 == height) {
2650 hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height);
2651 } else if (16 == height) {
2652 hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height);
2653 }
2654 }
2655
2656 static void hevc_vt_4t_6w_msa(uint8_t *src,
2657 int32_t src_stride,
2658 int16_t *dst,
2659 int32_t dst_stride,
2660 const int8_t *filter,
2661 int32_t height)
2662 {
2663 int32_t loop_cnt;
2664 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
2665 uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
2666 v16i8 src0, src1, src2, src3, src4;
2667 v16i8 src10_r, src32_r, src21_r, src43_r;
2668 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2669 v8i16 filt0, filt1;
2670 v8i16 filter_vec, const_vec;
2671
2672 src -= src_stride;
2673 const_vec = __msa_ldi_h(128);
2674 const_vec <<= 6;
2675
2676 filter_vec = LD_SH(filter);
2677 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2678
2679 LD_SB3(src, src_stride, src0, src1, src2);
2680 src += (3 * src_stride);
2681 XORI_B3_128_SB(src0, src1, src2);
2682 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2683
2684 for (loop_cnt = (height >> 2); loop_cnt--;) {
2685 LD_SB2(src, src_stride, src3, src4);
2686 src += (2 * src_stride);
2687 XORI_B2_128_SB(src3, src4);
2688 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2689
2690 dst0_r = const_vec;
2691 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2692 dst1_r = const_vec;
2693 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2694
2695 LD_SB2(src, src_stride, src1, src2);
2696 src += (2 * src_stride);
2697 XORI_B2_128_SB(src1, src2);
2698 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2699
2700 dst2_r = const_vec;
2701 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2702 dst3_r = const_vec;
2703 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
2704
2705 dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
2706 dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
2707 dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
2708 dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
2709
2710 dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
2711 dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
2712 dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
2713 dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
2714
2715 SD(dst_val0, dst);
2716 SW(dst_val_int0, dst + 4);
2717 dst += dst_stride;
2718 SD(dst_val1, dst);
2719 SW(dst_val_int1, dst + 4);
2720 dst += dst_stride;
2721 SD(dst_val2, dst);
2722 SW(dst_val_int2, dst + 4);
2723 dst += dst_stride;
2724 SD(dst_val3, dst);
2725 SW(dst_val_int3, dst + 4);
2726 dst += dst_stride;
2727 }
2728 }
2729
2730 static void hevc_vt_4t_8x2_msa(uint8_t *src,
2731 int32_t src_stride,
2732 int16_t *dst,
2733 int32_t dst_stride,
2734 const int8_t *filter)
2735 {
2736 v16i8 src0, src1, src2, src3, src4;
2737 v16i8 src10_r, src32_r, src21_r, src43_r;
2738 v8i16 dst0_r, dst1_r;
2739 v8i16 filt0, filt1;
2740 v8i16 filter_vec, const_vec;
2741
2742 src -= src_stride;
2743 const_vec = __msa_ldi_h(128);
2744 const_vec <<= 6;
2745
2746 filter_vec = LD_SH(filter);
2747 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2748
2749 LD_SB3(src, src_stride, src0, src1, src2);
2750 src += (3 * src_stride);
2751 XORI_B3_128_SB(src0, src1, src2);
2752 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2753
2754 LD_SB2(src, src_stride, src3, src4);
2755 XORI_B2_128_SB(src3, src4);
2756 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2757 dst0_r = const_vec;
2758 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2759 dst1_r = const_vec;
2760 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2761
2762 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2763 }
2764
2765 static void hevc_vt_4t_8x6_msa(uint8_t *src,
2766 int32_t src_stride,
2767 int16_t *dst,
2768 int32_t dst_stride,
2769 const int8_t *filter)
2770 {
2771 v16i8 src0, src1, src2, src3, src4;
2772 v16i8 src10_r, src32_r, src21_r, src43_r;
2773 v8i16 dst0_r, dst1_r;
2774 v8i16 filt0, filt1;
2775 v8i16 filter_vec, const_vec;
2776
2777 src -= src_stride;
2778 const_vec = __msa_ldi_h(128);
2779 const_vec <<= 6;
2780
2781 filter_vec = LD_SH(filter);
2782 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2783
2784 LD_SB3(src, src_stride, src0, src1, src2);
2785 src += (3 * src_stride);
2786 XORI_B3_128_SB(src0, src1, src2);
2787 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2788
2789 LD_SB2(src, src_stride, src3, src4);
2790 src += (2 * src_stride);
2791 XORI_B2_128_SB(src3, src4);
2792
2793 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794 dst0_r = const_vec;
2795 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2796 dst1_r = const_vec;
2797 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2798
2799 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2800 dst += (2 * dst_stride);
2801
2802 LD_SB2(src, src_stride, src1, src2);
2803 src += (2 * src_stride);
2804 XORI_B2_128_SB(src1, src2);
2805
2806 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2807 dst0_r = const_vec;
2808 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2809 dst1_r = const_vec;
2810 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2811
2812 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2813 dst += (2 * dst_stride);
2814
2815 LD_SB2(src, src_stride, src3, src4);
2816 XORI_B2_128_SB(src3, src4);
2817
2818 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2819 dst0_r = const_vec;
2820 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2821 dst1_r = const_vec;
2822 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2823
2824 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2825 }
2826
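/* Three rows are loaded and interleaved once as a prologue; each iteration
 * consumes four new rows and keeps the last two interleaved pairs
 * (src54_r/src65_r become src10_r/src21_r) for the next window. */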
2827 static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
2828 int32_t src_stride,
2829 int16_t *dst,
2830 int32_t dst_stride,
2831 const int8_t *filter,
2832 int32_t height)
2833 {
2834 int32_t loop_cnt;
2835 v16i8 src0, src1, src2, src3, src4, src5, src6;
2836 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2837 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2838 v8i16 filt0, filt1;
2839 v8i16 filter_vec, const_vec;
2840
2841 src -= src_stride;
2842 const_vec = __msa_ldi_h(128);
2843 const_vec <<= 6;
2844
2845 filter_vec = LD_SH(filter);
2846 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2847
2848 LD_SB3(src, src_stride, src0, src1, src2);
2849 src += (3 * src_stride);
2850 XORI_B3_128_SB(src0, src1, src2);
2851 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2852
2853 for (loop_cnt = (height >> 2); loop_cnt--;) {
2854 LD_SB4(src, src_stride, src3, src4, src5, src6);
2855 src += (4 * src_stride);
2856 XORI_B4_128_SB(src3, src4, src5, src6);
2857 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2858 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2859 dst0_r = const_vec;
2860 dst1_r = const_vec;
2861 dst2_r = const_vec;
2862 dst3_r = const_vec;
2863 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2864 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2865 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
2866 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
2867 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2868 dst += (4 * dst_stride);
2869
2870 src2 = src6;
2871 src10_r = src54_r;
2872 src21_r = src65_r;
2873 }
2874 }
2875
2876 static void hevc_vt_4t_8w_msa(uint8_t *src,
2877 int32_t src_stride,
2878 int16_t *dst,
2879 int32_t dst_stride,
2880 const int8_t *filter,
2881 int32_t height)
2882 {
2883 if (2 == height) {
2884 hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2885 } else if (6 == height) {
2886 hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2887 } else {
2888 hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2889 filter, height);
2890 }
2891 }
2892
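/* 12 = 8 + 4 for the vertical case: the ILVL halves of the row interleaves
 * carry columns 8..15, and their low doublewords (columns 8..11) for two
 * row pairs are packed into single registers (src2110, src4332, src6554)
 * so the narrow tail is filtered with full-width dot products. */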
2893 static void hevc_vt_4t_12w_msa(uint8_t *src,
2894 int32_t src_stride,
2895 int16_t *dst,
2896 int32_t dst_stride,
2897 const int8_t *filter,
2898 int32_t height)
2899 {
2900 int32_t loop_cnt;
2901 v16i8 src0, src1, src2, src3, src4, src5, src6;
2902 v16i8 src10_r, src32_r, src21_r, src43_r;
2903 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2904 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2905 v16i8 src2110, src4332;
2906 v16i8 src54_r, src65_r, src6554;
2907 v8i16 dst0_l, dst1_l;
2908 v8i16 filt0, filt1;
2909 v8i16 filter_vec, const_vec;
2910
2911 src -= (1 * src_stride);
2912 const_vec = __msa_ldi_h(128);
2913 const_vec <<= 6;
2914
2915 filter_vec = LD_SH(filter);
2916 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2917
2918 LD_SB3(src, src_stride, src0, src1, src2);
2919 src += (3 * src_stride);
2920 XORI_B3_128_SB(src0, src1, src2);
2921 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2922 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2923 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2924
2925 for (loop_cnt = 4; loop_cnt--;) {
2926 LD_SB2(src, src_stride, src3, src4);
2927 src += (2 * src_stride);
2928 LD_SB2(src, src_stride, src5, src6);
2929 src += (2 * src_stride);
2930 XORI_B2_128_SB(src3, src4);
2931 XORI_B2_128_SB(src5, src6);
2932
2933 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2934 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2935 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2936 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2937 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2938 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2939
2940 dst0_r = const_vec;
2941 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2942 dst1_r = const_vec;
2943 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2944 dst2_r = const_vec;
2945 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
2946 dst3_r = const_vec;
2947 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
2948 dst0_l = const_vec;
2949 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
2950 dst1_l = const_vec;
2951 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
2952
2953 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2954 ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
2955 dst += (4 * dst_stride);
2956
2957 src2 = src6;
2958 src10_r = src54_r;
2959 src21_r = src65_r;
2960 src2110 = src6554;
2961 }
2962 }
2963
2964 static void hevc_vt_4t_16w_msa(uint8_t *src,
2965 int32_t src_stride,
2966 int16_t *dst,
2967 int32_t dst_stride,
2968 const int8_t *filter,
2969 int32_t height)
2970 {
2971 int32_t loop_cnt;
2972 v16i8 src0, src1, src2, src3, src4, src5;
2973 v16i8 src10_r, src32_r, src21_r, src43_r;
2974 v16i8 src10_l, src32_l, src21_l, src43_l;
2975 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
2976 v8i16 filt0, filt1;
2977 v8i16 filter_vec, const_vec;
2978
2979 src -= src_stride;
2980 const_vec = __msa_ldi_h(128);
2981 const_vec <<= 6;
2982
2983 filter_vec = LD_SH(filter);
2984 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2985
2986 LD_SB3(src, src_stride, src0, src1, src2);
2987 src += (3 * src_stride);
2988 XORI_B3_128_SB(src0, src1, src2);
2989 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2990 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2991
2992 for (loop_cnt = (height >> 2); loop_cnt--;) {
2993 LD_SB2(src, src_stride, src3, src4);
2994 src += (2 * src_stride);
2995 XORI_B2_128_SB(src3, src4);
2996 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2997 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2998 dst0_r = const_vec;
2999 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3000 dst0_l = const_vec;
3001 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3002 dst1_r = const_vec;
3003 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3004 dst1_l = const_vec;
3005 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3006 ST_SH2(dst0_r, dst0_l, dst, 8);
3007 dst += dst_stride;
3008 ST_SH2(dst1_r, dst1_l, dst, 8);
3009 dst += dst_stride;
3010
3011 LD_SB2(src, src_stride, src5, src2);
3012 src += (2 * src_stride);
3013 XORI_B2_128_SB(src5, src2);
3014 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3015 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3016 dst0_r = const_vec;
3017 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3018 dst0_l = const_vec;
3019 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3020 dst1_r = const_vec;
3021 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3022 dst1_l = const_vec;
3023 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3024 ST_SH2(dst0_r, dst0_l, dst, 8);
3025 dst += dst_stride;
3026 ST_SH2(dst1_r, dst1_l, dst, 8);
3027 dst += dst_stride;
3028 }
3029 }
3030
3031 static void hevc_vt_4t_24w_msa(uint8_t *src,
3032 int32_t src_stride,
3033 int16_t *dst,
3034 int32_t dst_stride,
3035 const int8_t *filter,
3036 int32_t height)
3037 {
3038 int32_t loop_cnt;
3039 v16i8 src0, src1, src2, src3, src4, src5;
3040 v16i8 src6, src7, src8, src9, src10, src11;
3041 v16i8 src10_r, src32_r, src76_r, src98_r;
3042 v16i8 src21_r, src43_r, src87_r, src109_r;
3043 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3044 v16i8 src10_l, src32_l, src21_l, src43_l;
3045 v8i16 dst0_l, dst1_l;
3046 v8i16 filt0, filt1;
3047 v8i16 filter_vec, const_vec;
3048
3049 src -= src_stride;
3050 const_vec = __msa_ldi_h(128);
3051 const_vec <<= 6;
3052
3053 filter_vec = LD_SH(filter);
3054 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3055
3056 LD_SB3(src, src_stride, src0, src1, src2);
3057 XORI_B3_128_SB(src0, src1, src2);
3058 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3059 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3060
3061 LD_SB3(src + 16, src_stride, src6, src7, src8);
3062 src += (3 * src_stride);
3063 XORI_B3_128_SB(src6, src7, src8);
3064 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3065
3066 for (loop_cnt = (height >> 2); loop_cnt--;) {
3067 LD_SB2(src, src_stride, src3, src4);
3068 XORI_B2_128_SB(src3, src4);
3069 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3070 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3071
3072 LD_SB2(src + 16, src_stride, src9, src10);
3073 src += (2 * src_stride);
3074 XORI_B2_128_SB(src9, src10);
3075 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3076
3077 dst0_r = const_vec;
3078 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3079 dst0_l = const_vec;
3080 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3081 dst1_r = const_vec;
3082 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3083 dst1_l = const_vec;
3084 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3085 dst2_r = const_vec;
3086 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3087 dst3_r = const_vec;
3088 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3089
3090 ST_SH2(dst0_r, dst0_l, dst, 8);
3091 ST_SH(dst2_r, dst + 16);
3092 dst += dst_stride;
3093 ST_SH2(dst1_r, dst1_l, dst, 8);
3094 ST_SH(dst3_r, dst + 16);
3095 dst += dst_stride;
3096
3097 LD_SB2(src, src_stride, src5, src2);
3098 XORI_B2_128_SB(src5, src2);
3099 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3100 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3101
3102 LD_SB2(src + 16, src_stride, src11, src8);
3103 src += (2 * src_stride);
3104 XORI_B2_128_SB(src11, src8);
3105 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3106
3107 dst0_r = const_vec;
3108 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3109 dst0_l = const_vec;
3110 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3111 dst1_r = const_vec;
3112 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3113 dst1_l = const_vec;
3114 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3115 dst2_r = const_vec;
3116 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3117 dst3_r = const_vec;
3118 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3119
3120 ST_SH2(dst0_r, dst0_l, dst, 8);
3121 ST_SH(dst2_r, dst + 16);
3122 dst += dst_stride;
3123 ST_SH2(dst1_r, dst1_l, dst, 8);
3124 ST_SH(dst3_r, dst + 16);
3125 dst += dst_stride;
3126 }
3127 }
3128
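/* Vertical 4-tap interpolation for 32-column blocks: same scheme as the
 * 24-column case, but both 16-column halves keep right and left
 * interleaves, so two full 32-column rows are emitted per half-step. */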
static void hevc_vt_4t_32w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
    }
}

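/* Horizontal+vertical 4-tap interpolation for a 4x2 block. Five input
 * rows are shuffled and filtered horizontally two rows per vector, then
 * the 16-bit horizontal results are filtered vertically at 32-bit
 * precision and scaled down by 6 bits. */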
static void hevc_hv_4t_4x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = const_vec;
    dst31 = const_vec;
    dst42 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42);
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
}

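/* Horizontal+vertical 4-tap interpolation for a 4x4 block, using seven
 * input rows (3 rows of filter context plus 4 output rows) packed two
 * rows per horizontal-filter vector. */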
static void hevc_hv_4t_4x4_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, const_vec;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = const_vec;
    dst41 = const_vec;
    dst52 = const_vec;
    dst63 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
    ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
}

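/* Horizontal+vertical 4-tap interpolation for 4-column blocks whose
 * height is a multiple of 8. The last two interleaved row pairs and the
 * upper half of the newest horizontal result (dst22) are carried across
 * iterations as the vertical-filter history. */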
static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
    dst21 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = const_vec;
        dst84 = const_vec;
        dst95 = const_vec;
        dst106 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

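/* Dispatch 4-column hv interpolation by height (2, 4 or a multiple of 8). */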
static void hevc_hv_4t_4w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height);
    }
}

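/* Horizontal+vertical 4-tap interpolation for a 6-column block: columns
 * 0..3 are stored as doublewords, columns 4..5 as words from the packed
 * left-half results. The function processes a fixed 8 rows; the height
 * argument is not consulted. */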
static void hevc_hv_4t_6w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;

    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = const_vec;
    dsth1 = const_vec;
    dsth2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    dsth3 = const_vec;
    dsth4 = const_vec;
    dsth5 = const_vec;
    dsth6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    dsth7 = const_vec;
    dsth8 = const_vec;
    dsth9 = const_vec;
    dsth10 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);

    ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
    ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
    ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
    ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
    ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
    ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
    ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
    ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
    dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}

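/* Horizontal+vertical 4-tap interpolation for an 8x2 block: five input
 * rows, one row per horizontal-filter vector. */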
static void hevc_hv_4t_8x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    dst3 = const_vec;
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
    dst4 = const_vec;
    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}

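/* Horizontal+vertical 4-tap interpolation for height-4 blocks whose
 * width is a multiple of 8; width8mult selects how many 8-column
 * stripes are processed. */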
static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
                                   int16_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t width8mult)
{
    int32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src += 8;
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);

        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);

        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += 8;
    }
}

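/* Horizontal+vertical 4-tap interpolation for an 8x6 block: all nine
 * input rows are filtered horizontally up front, then combined
 * vertically into six output rows. */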
static void hevc_hv_4t_8x6_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    dst3 = const_vec;
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
    dst4 = const_vec;
    DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
    dst5 = const_vec;
    DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
    dst6 = const_vec;
    DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
    dst7 = const_vec;
    DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
    dst8 = const_vec;
    DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);

    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}

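/* Horizontal+vertical 4-tap interpolation for blocks whose width is a
 * multiple of 8 and height a multiple of 4; the outer loop walks
 * 8-column stripes, the inner loop emits 4 rows per iteration while
 * recycling the last two interleaved row pairs as filter history. */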
static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height,
                                       int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = const_vec;
            dst4 = const_vec;
            dst5 = const_vec;
            dst6 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
            DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
            DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
            DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}

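/* Dispatch 8-column hv interpolation by height (2, 4, 6 or a multiple of 4). */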
static void hevc_hv_4t_8w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 1);
    }
}

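/* Horizontal+vertical 4-tap interpolation for 12-column blocks: the
 * left 8 columns reuse the 8-column scheme (four rows per iteration),
 * the right 4 columns the 4-column scheme (eight rows per iteration).
 * The loop counts are fixed for the 16-row case; the height argument is
 * not consulted. */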
static void hevc_hv_4t_12w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
    v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    dst1 = const_vec;
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dst2 = dst6;
    }

    src += 8;
    dst += 8;

    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
    dst10 = const_vec;
    dst21 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = const_vec;
        dst84 = const_vec;
        dst95 = const_vec;
        dst106 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);

        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
        PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
                    tmp2, tmp3);
        ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}

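/* Dispatch 16-column hv interpolation: a dedicated path for height 4,
 * otherwise the generic multiple-of-4 path with two 8-column stripes. */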
static void hevc_hv_4t_16w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    if (4 == height) {
        hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 2);
    } else {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
    }
}

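/* 24-column hv interpolation as three 8-column stripes. */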
static void hevc_hv_4t_24w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 3);
}

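/* 32-column hv interpolation as four 8-column stripes. */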
static void hevc_hv_4t_32w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 4);
}

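/* Instantiate the public pel_pixels entry points; they forward to the
 * width-specific copy kernels with the fixed MAX_PB_SIZE destination
 * stride used for the HEVC 16-bit intermediate buffer. */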
#define MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height,               \
                                                intptr_t mx,              \
                                                intptr_t my,              \
                                                int width)                \
{                                                                         \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
}

MC_COPY(4);
MC_COPY(6);
MC_COPY(8);
MC_COPY(12);
MC_COPY(16);
MC_COPY(24);
MC_COPY(32);
MC_COPY(48);
MC_COPY(64);

#undef MC_COPY

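/* Instantiate the one-dimensional qpel/epel entry points. DIR1 selects
 * the horizontal (hz) or vertical (vt) kernel and FILT_DIR picks the
 * matching mx/my fraction, which indexes the filter table (fraction 1
 * maps to entry 0). */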
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
                                                                          \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
                                          MAX_PB_SIZE, filter, height);   \
}

MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);

MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);

MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);

#undef MC

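/* Instantiate the two-dimensional (hv) entry points; mx and my select
 * the horizontal and vertical filters independently. */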
#define MC_HV(PEL, WIDTH, TAP)                                            \
void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height,               \
                                                intptr_t mx,              \
                                                intptr_t my,              \
                                                int width)                \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,    \
                                    filter_x, filter_y, height);          \
}

MC_HV(qpel, 4, 8);
MC_HV(qpel, 8, 8);
MC_HV(qpel, 12, 8);
MC_HV(qpel, 16, 8);
MC_HV(qpel, 24, 8);
MC_HV(qpel, 32, 8);
MC_HV(qpel, 48, 8);
MC_HV(qpel, 64, 8);

MC_HV(epel, 4, 4);
MC_HV(epel, 6, 4);
MC_HV(epel, 8, 4);
MC_HV(epel, 12, 4);
MC_HV(epel, 16, 4);
MC_HV(epel, 24, 4);
MC_HV(epel, 32, 4);

#undef MC_HV