1 /*
2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
24
/* Byte-shuffle control patterns for the MSA VSHF instructions used by the
 * horizontal filters below: each byte pair selects two adjacent source
 * pixels so one shuffle produces the interleaved operands for a 2-tap
 * dot-product step.  The first 16 bytes index within a single vector
 * (8-wide cases); the second 16 bytes also reach into a second vector
 * (indices >= 16) for the 4-wide cases where two rows share one register. */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
31
/* Copy a 4-pixel-wide block of 8-bit samples into the 16-bit intermediate
 * buffer, scaling each sample by << 6 (8-bit input promoted to HEVC
 * intermediate precision).  Handles height == 2, height == 4, and any
 * multiple of 8; other heights are silently ignored. */
static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0;

        LD_SB2(src, src_stride, src0, src1);

        /* Pack the two 4-byte rows into one vector, then zero-extend
         * bytes to halfwords before scaling. */
        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_b(zero, src0);
        in0 <<= 6;
        ST8x2_UB(in0, dst, 2 * dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        /* Two rows per vector: rows (0,1) -> in0, rows (2,3) -> in1. */
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3;
        uint32_t loop_cnt;

        /* Process 8 rows per iteration, two 4-pixel rows per vector. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            SLLI_4V(in0, in1, in2, in3, 6);
            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
79
/* Copy a 6-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 8 (loop runs
 * height / 8 times); 8 samples per row are widened but only the first
 * 6 halfwords (12 bytes) of each row are stored via ST12x8_UB. */
static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        /* Zero-extend 8 bytes of each row to halfwords. */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in4, in5, in6, in7);
        SLLI_4V(in0, in1, in2, in3, 6);
        SLLI_4V(in4, in5, in6, in7, 6);
        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}
103
/* Copy an 8-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Special-cases heights 2, 4 and 6, plus any
 * multiple of 8; other heights are silently ignored. */
static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
                             int16_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16i8 zero = { 0 };

    if (2 == height) {
        v16i8 src0, src1;
        v8i16 in0, in1;

        LD_SB2(src, src_stride, src0, src1);

        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH2(in0, in1, dst, dst_stride);
    } else if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0, in1, in2, in3;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;

        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
        SLLI_4V(in0, in1, in2, in3, 6);
        in4 <<= 6;
        in5 <<= 6;
        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        /* 8 rows per iteration, one full vector store per row. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in4, in5, in6, in7);
            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}
164
/* Copy a 12-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 8.  Columns 0-7
 * come from the right (low) halves of the loaded rows; columns 8-11 are
 * gathered from the upper words via ILVL_W2 and stored as 8-byte chunks. */
static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        /* Rows 0-3: left 8 columns ... */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        /* ... and columns 8-11 (two rows packed per vector). */
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);

        /* Rows 4-7: same pattern. */
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
        in0 <<= 6;
        in1 <<= 6;
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    }
}
201
/* Copy a 16-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Special-cases heights 4 and 12, plus any
 * multiple of 8.  Each row needs two vector stores: the right (low)
 * byte half widens into columns 0-7, the left (high) half into 8-15. */
static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    v16i8 zero = { 0 };

    if (4 == height) {
        v16i8 src0, src1, src2, src3;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB4(src, src_stride, src0, src1, src2, src3);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        /* Load all 12 rows up front, then emit them 4 rows at a time. */
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
        uint32_t loop_cnt;
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

        /* 8 rows per iteration, emitted as two groups of 4. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);

            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
292
/* Copy a 24-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 4.  Handled as a
 * full 16-wide part (both byte halves widened) plus an 8-wide tail from
 * the second set of loads (only the right halves of src4..src7 used). */
static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* src0..src3: columns 0-15; src4..src7: columns 16-23 (plus
         * 8 unused bytes). */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
        dst += (4 * dst_stride);
    }
}
321
/* Copy a 32-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 4.  Even-indexed
 * src vectors hold columns 0-15 of a row, odd-indexed hold columns 16-31;
 * each output row is written as four consecutive 8-halfword stores. */
static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        /* Rows 0 and 1 (src0/src1 and src2/src3). */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;

        /* Rows 2 and 3 (src4/src5 and src6/src7). */
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;
    }
}
359
/* Copy a 48-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 4.  Each row is
 * three 16-byte loads; two rows (six vectors) are widened and shifted
 * together, then written as six 8-halfword stores per row. */
static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src8, src9, src10, src11;
    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;
        LD_SB3(src, 16, src6, src7, src8);
        src += src_stride;
        LD_SB3(src, 16, src9, src10, src11);
        src += src_stride;

        /* Rows 0 and 1 (src0..src5). */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
        ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;

        /* Rows 2 and 3 (src6..src11). */
        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
        ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;
    }
}
410
/* Copy a 64-pixel-wide block of 8-bit samples to the 16-bit intermediate
 * buffer, scaled by << 6.  Height must be a multiple of 2.  Each row is
 * four 16-byte loads, widened to eight halfword vectors and written as
 * two groups of four 8-halfword stores (columns 0-31 and 32-63). */
static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              int32_t height)
{
    uint32_t loop_cnt;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        /* First row. */
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;

        /* Second row. */
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;
    }
}
447
/* Horizontal 8-tap filter, 4-pixel-wide block, 16-bit intermediate output.
 * `filter` points at 8 coefficients loaded as halfwords; height must be a
 * multiple of 8.  Source pixels are XORed with 128 to move them into signed
 * byte range for the signed dot products; const_vec (128 << 6) pre-biases
 * each accumulator to cancel that offset (assumes the filter taps sum to
 * 64 as in the standard HEVC 8-tap filters).  Two rows are packed per
 * vector using the second half of ff_hevc_mask_arr. */
static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);   /* 4-width shuffle masks */

    src -= 3;   /* 8-tap filter reads 3 pixels left of the output position */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    /* mask0..mask3 select the pixel pairs for tap pairs 0/1, 2/3, 4/5, 6/7 */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* Each dstN accumulates two 4-wide rows. */
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}
502
/* Horizontal 8-tap filter, 8-pixel-wide block, 16-bit intermediate output.
 * Same bias scheme as hevc_hz_8t_4w_msa (XOR 128 + const_vec pre-bias);
 * height must be a multiple of 4.  One row per dstN vector, using the
 * single-vector (8-width) masks from ff_hevc_mask_arr. */
static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;   /* left context for the 8-tap filter */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
557
/* Horizontal 8-tap filter, 12-pixel-wide block, 16-bit intermediate output.
 * Columns 0-7 use the 8-width masks (src0..src3); columns 8-11 are loaded
 * at src+8 (src4..src7) and use the 4-width masks, packing two rows per
 * dot-product vector.  The fixed loop count of 4 iterations x 4 rows
 * implies a height of 16 — NOTE(review): callers presumably only invoke
 * this for height 16; confirm against dispatch table. */
static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    int64_t res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    src -= 3;   /* left context for the 8-tap filter */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);        /* 8-width masks */
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(ff_hevc_mask_arr + 16);   /* 4-width (two-row) masks */
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        /* Accumulate one tap pair at a time across all six outputs. */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        /* dst4/dst5 each hold two 4-sample rows; store 8 bytes per row. */
        res0 = __msa_copy_s_d((v2i64) dst4, 0);
        res1 = __msa_copy_s_d((v2i64) dst4, 1);
        res2 = __msa_copy_s_d((v2i64) dst5, 0);
        res3 = __msa_copy_s_d((v2i64) dst5, 1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
    }
}
632
/* Horizontal 8-tap filter, 16-pixel-wide block, 16-bit intermediate output.
 * Height must be a multiple of 2.  Each row is covered by two loads
 * (src+0 and src+8) so only the single-vector 8-width masks are needed;
 * same XOR-128 / const_vec bias scheme as the narrower variants. */
static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;   /* left context for the 8-tap filter */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* src0/src2: rows 0/1 columns 0-; src1/src3: same rows at +8. */
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        /* One tap pair per pass, accumulated into all four outputs. */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        ST_SH2(dst0, dst2, dst, dst_stride);
        ST_SH2(dst1, dst3, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    }
}
689
/* Horizontal 8-tap filter, 24-pixel-wide block, 16-bit intermediate output.
 * Height must be a multiple of 2.  Each row spans two 16-byte loads; the
 * middle 8 output columns need pixels straddling the load boundary, so
 * masks 4-7 (mask0 + 8..14) shuffle across the (srcN, srcN+1) vector pair.
 * Same XOR-128 / const_vec (128 << 6) bias scheme as the other variants. */
static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;   /* left context for the 8-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* Cross-vector masks for the boundary-straddling middle columns. */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        /* dst0..dst2: row 0 columns 0-7 / 8-15 / 16-23;
         * dst3..dst5: row 1 likewise.  One tap pair per pass. */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask4, mask0, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask5, mask1, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask6, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        VSHF_B2_SB(src2, src3, src3, src3, mask7, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
    }
}
764
/* Horizontal 8-tap filter, 32-pixel-wide block, 16-bit intermediate output.
 * One row per loop iteration.  src0/src1 cover columns 0-31; src2 (loaded
 * at +24) supplies the right-edge context for columns 24-31 so only
 * single-vector masks are needed there, while masks 4-7 handle the
 * src0/src1 boundary.  Same XOR-128 / const_vec bias scheme. */
static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;   /* left context for the 8-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* Cross-vector masks for the src0/src1 boundary columns. */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);   /* overlapping load: right-edge context */
        src += src_stride;
        XORI_B3_128_SB(src0, src1, src2);

        /* dst0..dst3: output columns 0-7, 8-15, 16-23, 24-31. */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
}
824
/* Horizontal 8-tap filter, 48-pixel-wide block, 16-bit intermediate output.
 * One row per loop iteration, split into six 8-column outputs.  src3 is an
 * overlapping load at +40 to provide right-edge context for columns 40-47;
 * masks 4-7 handle columns straddling the src0/src1 and src1/src2
 * boundaries.  Same XOR-128 / const_vec (128 << 6) bias scheme. */
static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= 3;   /* left context for the 8-tap filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    /* Cross-vector masks for boundary-straddling columns. */
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);   /* overlapping load: right-edge context */
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        dst4 = const_vec;
        dst5 = const_vec;
        /* Columns 0-31 (dst0..dst3), one tap pair per pass. */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

        /* Columns 32-47 (dst4/dst5) from src2 and the overlapping src3. */
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt3, filt3, dst4, dst5);
        ST_SH2(dst4, dst5, (dst + 32), 8);
        dst += dst_stride;
    }
}
895
/* Horizontal HEVC 8-tap luma interpolation, 64-pixel-wide block.
 * One row per loop iteration: eight 8-sample groups are filtered and
 * stored individually as 16-bit intermediates (<< 6 precision, unrounded).
 * NOTE(review): mask4..7 straddle adjacent 16-byte source vectors, which
 * is why the odd-numbered groups shuffle (srcN, srcN+1) pairs. */
hevc_hz_8t_64w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)896 static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
897                                int16_t *dst, int32_t dst_stride,
898                                const int8_t *filter, int32_t height)
899 {
900     uint32_t loop_cnt;
901     v16i8 src0, src1, src2, src3, src4;
902     v8i16 filt0, filt1, filt2, filt3;
903     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
904     v16i8 vec0, vec1, vec2, vec3;
905     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
906     v8i16 filter_vec, const_vec;
907     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
908 
909     src -= 3;   /* point at the first filter tap */
910 
911     filter_vec = LD_SH(filter);
912     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
913 
914     mask1 = mask0 + 2;
915     mask2 = mask0 + 4;
916     mask3 = mask0 + 6;
917     mask4 = mask0 + 8;
918     mask5 = mask0 + 10;
919     mask6 = mask0 + 12;
920     mask7 = mask0 + 14;
921 
        /* bias compensating the +/-128 signedness conversion below */
922     const_vec = __msa_ldi_h(128);
923     const_vec <<= 6;
924 
925     for (loop_cnt = height; loop_cnt--;) {
            /* bytes src[0..70]: four contiguous loads + one at offset 56 */
926         LD_SB4(src, 16, src0, src1, src2, src3);
927         src4 = LD_SB(src + 56);
928         src += src_stride;
929         XORI_B5_128_SB(src0, src1, src2, src3, src4);
930 
931         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
932                    vec0, vec1, vec2, vec3);
933         dst0 = const_vec;
934         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
935                      dst0, dst0, dst0, dst0);
936         ST_SH(dst0, dst);
937 
938         VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
939                    vec0, vec1, vec2, vec3);
940         dst1 = const_vec;
941         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
942                      dst1, dst1, dst1, dst1);
943         ST_SH(dst1, dst + 8);
944 
945         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
946                    vec0, vec1, vec2, vec3);
947         dst2 = const_vec;
948         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
949                      dst2, dst2, dst2, dst2);
950         ST_SH(dst2, dst + 16);
951 
952         VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
953                    vec0, vec1, vec2, vec3);
954         dst3 = const_vec;
955         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
956                      dst3, dst3, dst3, dst3);
957         ST_SH(dst3, dst + 24);
958 
959         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
960                    vec0, vec1, vec2, vec3);
961         dst4 = const_vec;
962         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
963                      dst4, dst4, dst4, dst4);
964         ST_SH(dst4, dst + 32);
965 
966         VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
967                    vec0, vec1, vec2, vec3);
968         dst5 = const_vec;
969         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
970                      dst5, dst5, dst5, dst5);
971         ST_SH(dst5, dst + 40);
972 
973         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
974                    vec0, vec1, vec2, vec3);
975         dst6 = const_vec;
976         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
977                      dst6, dst6, dst6, dst6);
978         ST_SH(dst6, dst + 48);
979 
            /* last group uses the overlapping src4 load (offset 56) so
             * mask0..3 suffice — no cross-vector shuffle needed */
980         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
981                    vec0, vec1, vec2, vec3);
982         dst7 = const_vec;
983         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
984                      dst7, dst7, dst7, dst7);
985         ST_SH(dst7, dst + 56);
986         dst += dst_stride;
987     }
988 }
989
/* Vertical HEVC 8-tap luma interpolation, 4-pixel-wide block.
 * Processes 8 output rows per iteration; two 4-wide rows are packed into
 * each vector (the srcNNMM doubled names hold two interleaved row pairs),
 * so one dot-product sequence yields two rows of output.
 * NOTE(review): 'height' is assumed to be a multiple of 8 here (loop runs
 * height >> 3 with no tail) — callers presumably guarantee this. */
hevc_vt_8t_4w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)990 static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
991                               int16_t *dst, int32_t dst_stride,
992                               const int8_t *filter, int32_t height)
993 {
994     int32_t loop_cnt;
995     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
996     v16i8 src9, src10, src11, src12, src13, src14;
997     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
998     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
999     v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1000     v16i8 src2110, src4332, src6554, src8776, src10998;
1001     v16i8 src12111110, src14131312;
1002     v8i16 dst10, dst32, dst54, dst76;
1003     v8i16 filt0, filt1, filt2, filt3;
1004     v8i16 filter_vec, const_vec;
1005 
1006     src -= (3 * src_stride);   /* first of the 8 vertical taps */
1007 
1008     const_vec = __msa_ldi_h(128);
1009     const_vec <<= 6;
1010 
1011     filter_vec = LD_SH(filter);
1012     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1013 
        /* prologue: load the 7 history rows and interleave adjacent row
         * pairs; ILVR_D packs two row pairs into one 16-byte vector */
1014     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1015     src += (7 * src_stride);
1016     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1017                src10_r, src32_r, src54_r, src21_r);
1018     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1019     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1020                src2110, src4332, src6554);
1021     XORI_B3_128_SB(src2110, src4332, src6554);
1022 
1023     for (loop_cnt = (height >> 3); loop_cnt--;) {
1024         LD_SB8(src, src_stride,
1025                src7, src8, src9, src10, src11, src12, src13, src14);
1026         src += (8 * src_stride);
1027 
1028         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1029                    src76_r, src87_r, src98_r, src109_r);
1030         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1031                    src1110_r, src1211_r, src1312_r, src1413_r);
1032         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
1033                    src1211_r, src1110_r, src1413_r, src1312_r,
1034                    src8776, src10998, src12111110, src14131312);
1035         XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1036 
            /* 8-tap vertical filter as four 2-tap dot products per output */
1037         dst10 = const_vec;
1038         DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1039                      filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1040         dst32 = const_vec;
1041         DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1042                      filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1043         dst54 = const_vec;
1044         DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1045                      filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1046         dst76 = const_vec;
1047         DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1048                      filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1049 
1050         ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
1051         dst += (8 * dst_stride);
1052 
            /* slide the 6-row history window forward by 8 rows */
1053         src2110 = src10998;
1054         src4332 = src12111110;
1055         src6554 = src14131312;
1056         src6 = src14;
1057     }
1058 }
1059
/* Vertical HEVC 8-tap luma interpolation, 8-pixel-wide block.
 * Keeps a sliding 7-row history of byte-interleaved row pairs and emits
 * 4 output rows of 16-bit intermediates per iteration (height assumed to
 * be a multiple of 4 — loop has no tail). */
hevc_vt_8t_8w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1060 static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1061                               int16_t *dst, int32_t dst_stride,
1062                               const int8_t *filter, int32_t height)
1063 {
1064     int32_t loop_cnt;
1065     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1066     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1067     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1068     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1069     v8i16 filter_vec, const_vec;
1070     v8i16 filt0, filt1, filt2, filt3;
1071 
1072     src -= (3 * src_stride);   /* back up to the first vertical tap */
1073     const_vec = __msa_ldi_h(128);
1074     const_vec <<= 6;
1075 
1076     filter_vec = LD_SH(filter);
1077     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1078 
        /* prologue: 7 history rows, signedness-converted then interleaved */
1079     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1080     src += (7 * src_stride);
1081     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1082     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1083                src10_r, src32_r, src54_r, src21_r);
1084     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1085 
1086     for (loop_cnt = (height >> 2); loop_cnt--;) {
1087         LD_SB4(src, src_stride, src7, src8, src9, src10);
1088         src += (4 * src_stride);
1089         XORI_B4_128_SB(src7, src8, src9, src10);
1090         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1091                    src76_r, src87_r, src98_r, src109_r);
1092 
1093         dst0_r = const_vec;
1094         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1095                      filt0, filt1, filt2, filt3,
1096                      dst0_r, dst0_r, dst0_r, dst0_r);
1097         dst1_r = const_vec;
1098         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1099                      filt0, filt1, filt2, filt3,
1100                      dst1_r, dst1_r, dst1_r, dst1_r);
1101         dst2_r = const_vec;
1102         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1103                      filt0, filt1, filt2, filt3,
1104                      dst2_r, dst2_r, dst2_r, dst2_r);
1105         dst3_r = const_vec;
1106         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1107                      filt0, filt1, filt2, filt3,
1108                      dst3_r, dst3_r, dst3_r, dst3_r);
1109 
1110         ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1111         dst += (4 * dst_stride);
1112 
            /* advance the interleaved history window by 4 rows */
1113         src10_r = src54_r;
1114         src32_r = src76_r;
1115         src54_r = src98_r;
1116         src21_r = src65_r;
1117         src43_r = src87_r;
1118         src65_r = src109_r;
1119         src6 = src10;
1120     }
1121 }
1122
/* Vertical HEVC 8-tap luma interpolation, 12-pixel-wide block.
 * The left 8 columns use the right-interleaved (_r) vectors as in the 8w
 * case; the extra 4 columns come from the left-interleaved (_l) halves,
 * with two row pairs packed per vector (src2110 etc.), stored via the
 * 8-byte ST8x4 at dst + 8. Emits 4 rows per iteration. */
hevc_vt_8t_12w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1123 static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1124                                int16_t *dst, int32_t dst_stride,
1125                                const int8_t *filter, int32_t height)
1126 {
1127     int32_t loop_cnt;
1128     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1129     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1130     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1131     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1132     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1133     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1134     v16i8 src2110, src4332, src6554, src8776, src10998;
1135     v8i16 dst0_l, dst1_l;
1136     v8i16 filter_vec, const_vec;
1137     v8i16 filt0, filt1, filt2, filt3;
1138 
1139     src -= (3 * src_stride);
1140     const_vec = __msa_ldi_h(128);
1141     const_vec <<= 6;
1142 
1143     filter_vec = LD_SH(filter);
1144     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1145 
        /* prologue: 7 history rows interleaved both right (cols 0-7) and
         * left (cols 8-11, packed two row pairs per vector) */
1146     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1147     src += (7 * src_stride);
1148     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1149     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1150                src10_r, src32_r, src54_r, src21_r);
1151     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1152     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1153                src10_l, src32_l, src54_l, src21_l);
1154     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1155     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1156                src2110, src4332, src6554);
1157 
1158     for (loop_cnt = (height >> 2); loop_cnt--;) {
1159         LD_SB4(src, src_stride, src7, src8, src9, src10);
1160         src += (4 * src_stride);
1161         XORI_B4_128_SB(src7, src8, src9, src10);
1162         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1163                    src76_r, src87_r, src98_r, src109_r);
1164         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1165                    src76_l, src87_l, src98_l, src109_l);
1166         ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1167 
            /* columns 0-7: one full 8-wide output row per dot product */
1168         dst0_r = const_vec;
1169         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1170                      filt0, filt1, filt2, filt3,
1171                      dst0_r, dst0_r, dst0_r, dst0_r);
1172         dst1_r = const_vec;
1173         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1174                      filt0, filt1, filt2, filt3,
1175                      dst1_r, dst1_r, dst1_r, dst1_r);
1176         dst2_r = const_vec;
1177         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1178                      filt0, filt1, filt2, filt3,
1179                      dst2_r, dst2_r, dst2_r, dst2_r);
1180         dst3_r = const_vec;
1181         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1182                      filt0, filt1, filt2, filt3,
1183                      dst3_r, dst3_r, dst3_r, dst3_r);
            /* columns 8-11: each result holds two 4-wide output rows */
1184         dst0_l = const_vec;
1185         DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1186                      filt0, filt1, filt2, filt3,
1187                      dst0_l, dst0_l, dst0_l, dst0_l);
1188         dst1_l = const_vec;
1189         DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1190                      filt0, filt1, filt2, filt3,
1191                      dst1_l, dst1_l, dst1_l, dst1_l);
1192 
1193         ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1194         ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
1195         dst += (4 * dst_stride);
1196 
1197         src10_r = src54_r;
1198         src32_r = src76_r;
1199         src54_r = src98_r;
1200         src21_r = src65_r;
1201         src43_r = src87_r;
1202         src65_r = src109_r;
1203         src2110 = src6554;
1204         src4332 = src8776;
1205         src6554 = src10998;
1206         src6 = src10;
1207     }
1208 }
1209
/* Vertical HEVC 8-tap luma interpolation for any width that is a
 * multiple of 16. Outer loop walks 16-column stripes; inner loop emits
 * 4 rows per iteration using right- (_r, cols 0-7) and left- (_l,
 * cols 8-15) interleaved history vectors. Shared backend for the
 * 16/24/32/48/64-wide wrappers below. */
hevc_vt_8t_16multx4mult_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height,int32_t width)1210 static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
1211                                         int32_t src_stride,
1212                                         int16_t *dst,
1213                                         int32_t dst_stride,
1214                                         const int8_t *filter,
1215                                         int32_t height,
1216                                         int32_t width)
1217 {
1218     uint8_t *src_tmp;
1219     int16_t *dst_tmp;
1220     int32_t loop_cnt, cnt;
1221     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1222     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1223     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1224     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1225     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1226     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1227     v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
1228     v8i16 filter_vec, const_vec;
1229     v8i16 filt0, filt1, filt2, filt3;
1230 
1231     src -= (3 * src_stride);
1232     const_vec = __msa_ldi_h(128);
1233     const_vec <<= 6;
1234 
1235     filter_vec = LD_SH(filter);
1236     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1237 
1238     for (cnt = width >> 4; cnt--;) {   /* one 16-column stripe per pass */
1239         src_tmp = src;
1240         dst_tmp = dst;
1241 
            /* prologue for this stripe: 7 history rows */
1242         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1243         src_tmp += (7 * src_stride);
1244         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1245         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1246                    src10_r, src32_r, src54_r, src21_r);
1247         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1248         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1249                    src10_l, src32_l, src54_l, src21_l);
1250         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1251 
1252         for (loop_cnt = (height >> 2); loop_cnt--;) {
1253             LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1254             src_tmp += (4 * src_stride);
1255             XORI_B4_128_SB(src7, src8, src9, src10);
1256             ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1257                        src76_r, src87_r, src98_r, src109_r);
1258             ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1259                        src76_l, src87_l, src98_l, src109_l);
1260 
                /* lower 8 columns of the 4 output rows */
1261             dst0_r = const_vec;
1262             DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1263                          filt0, filt1, filt2, filt3,
1264                          dst0_r, dst0_r, dst0_r, dst0_r);
1265             dst1_r = const_vec;
1266             DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1267                          filt0, filt1, filt2, filt3,
1268                          dst1_r, dst1_r, dst1_r, dst1_r);
1269             dst2_r = const_vec;
1270             DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1271                          filt0, filt1, filt2, filt3,
1272                          dst2_r, dst2_r, dst2_r, dst2_r);
1273             dst3_r = const_vec;
1274             DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1275                          filt0, filt1, filt2, filt3,
1276                          dst3_r, dst3_r, dst3_r, dst3_r);
                /* upper 8 columns of the same 4 rows */
1277             dst0_l = const_vec;
1278             DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1279                          filt0, filt1, filt2, filt3,
1280                          dst0_l, dst0_l, dst0_l, dst0_l);
1281             dst1_l = const_vec;
1282             DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1283                          filt0, filt1, filt2, filt3,
1284                          dst1_l, dst1_l, dst1_l, dst1_l);
1285             dst2_l = const_vec;
1286             DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
1287                          filt0, filt1, filt2, filt3,
1288                          dst2_l, dst2_l, dst2_l, dst2_l);
1289             dst3_l = const_vec;
1290             DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
1291                          filt0, filt1, filt2, filt3,
1292                          dst3_l, dst3_l, dst3_l, dst3_l);
1293 
1294             ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
1295             ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
1296             dst_tmp += (4 * dst_stride);
1297 
                /* slide both history windows forward by 4 rows */
1298             src10_r = src54_r;
1299             src32_r = src76_r;
1300             src54_r = src98_r;
1301             src21_r = src65_r;
1302             src43_r = src87_r;
1303             src65_r = src109_r;
1304             src10_l = src54_l;
1305             src32_l = src76_l;
1306             src54_l = src98_l;
1307             src21_l = src65_l;
1308             src43_l = src87_l;
1309             src65_l = src109_l;
1310             src6 = src10;
1311         }
1312 
1313         src += 16;
1314         dst += 16;
1315     }
1316 }
1317
/* Vertical 8-tap, width 16: single 16-column stripe of the generic core. */
hevc_vt_8t_16w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1318 static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1319                                int16_t *dst, int32_t dst_stride,
1320                                const int8_t *filter, int32_t height)
1321 {
1322     hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1323                                 filter, height, 16);
1324 }
1325
/* Vertical 8-tap, width 24: 16-wide stripe via the generic core plus an
 * 8-wide pass for the remaining columns 16..23. */
hevc_vt_8t_24w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1326 static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1327                                int16_t *dst, int32_t dst_stride,
1328                                const int8_t *filter, int32_t height)
1329 {
1330     hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1331                                 filter, height, 16);
1332     hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1333                       filter, height);
1334 }
1335
/* Vertical 8-tap, width 32: two 16-column stripes of the generic core. */
hevc_vt_8t_32w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1336 static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1337                                int16_t *dst, int32_t dst_stride,
1338                                const int8_t *filter, int32_t height)
1339 {
1340     hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1341                                 filter, height, 32);
1342 }
1343
/* Vertical 8-tap, width 48: three 16-column stripes of the generic core. */
hevc_vt_8t_48w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1344 static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1345                                int16_t *dst, int32_t dst_stride,
1346                                const int8_t *filter, int32_t height)
1347 {
1348     hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1349                                 filter, height, 48);
1350 }
1351
/* Vertical 8-tap, width 64: four 16-column stripes of the generic core. */
hevc_vt_8t_64w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter,int32_t height)1352 static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1353                                int16_t *dst, int32_t dst_stride,
1354                                const int8_t *filter, int32_t height)
1355 {
1356     hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1357                                 filter, height, 64);
1358 }
1359
/* 2-D (horizontal then vertical) HEVC 8-tap luma interpolation,
 * 4-pixel-wide block. The horizontal pass packs two 4-wide rows per
 * vector (dst30 = rows 3 and 0, dst41 = rows 4 and 1, ...) using the
 * 4-width shuffle mask at ff_hevc_mask_arr + 16; the vertical pass runs
 * HEVC_FILT_8TAP on the interleaved 16-bit results and shifts by 6.
 * Emits 4 output rows per iteration (height assumed multiple of 4). */
hevc_hv_8t_4w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1360 static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
1361                               int16_t *dst, int32_t dst_stride,
1362                               const int8_t *filter_x, const int8_t *filter_y,
1363                               int32_t height)
1364 {
1365     uint32_t loop_cnt;
1366     int32_t dst_stride_in_bytes = 2 * dst_stride;   /* dst is int16_t */
1367     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1368     v8i16 filt0, filt1, filt2, filt3;
1369     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1370     v16i8 mask1, mask2, mask3;
1371     v8i16 filter_vec, const_vec;
1372     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1373     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1374     v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1375     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1376     v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1377     v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1378     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);   /* 4-width mask set */
1379 
1380     src -= ((3 * src_stride) + 3);   /* 3 rows up, 3 columns left */
1381     filter_vec = LD_SH(filter_x);
1382     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1383 
1384     filter_vec = LD_SH(filter_y);
1385     UNPCK_R_SB_SH(filter_vec, filter_vec);
1386 
1387     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1388 
1389     mask1 = mask0 + 2;
1390     mask2 = mask0 + 4;
1391     mask3 = mask0 + 6;
1392 
1393     const_vec = __msa_ldi_h(128);
1394     const_vec <<= 6;
1395 
        /* prologue: horizontally filter the 7 history rows, two at a time */
1396     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1397     src += (7 * src_stride);
1398     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1399 
1400     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1401     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1402     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1403                vec8, vec9, vec10, vec11);
1404     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1405                vec12, vec13, vec14, vec15);
1406     dst30 = const_vec;
1407     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1408                  dst30, dst30, dst30, dst30);
1409     dst41 = const_vec;
1410     DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1411                  dst41, dst41, dst41, dst41);
1412     dst52 = const_vec;
1413     DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1414                  dst52, dst52, dst52, dst52);
1415     dst63 = const_vec;
1416     DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1417                  dst63, dst63, dst63, dst63);
1418 
        /* interleave consecutive horizontal rows for the vertical taps */
1419     ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1420     ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1421     ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1422     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);   /* row 6 alone */
1423 
1424     for (loop_cnt = height >> 2; loop_cnt--;) {
1425         LD_SB4(src, src_stride, src7, src8, src9, src10);
1426         src += (4 * src_stride);
1427         XORI_B4_128_SB(src7, src8, src9, src10);
1428 
1429         VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1430                    vec0, vec1, vec2, vec3);
1431         VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1432                    vec4, vec5, vec6, vec7);
1433         dst97 = const_vec;
1434         dst108 = const_vec;
1435         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1436                      dst97, dst97, dst97, dst97);
1437         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1438                      dst108, dst108, dst108, dst108);
1439 
1440         dst76_r = __msa_ilvr_h(dst97, dst66);
1441         ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1442         dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1443         dst98_r = __msa_ilvr_h(dst66, dst108);
1444 
            /* vertical 8-tap on the 16-bit horizontal results */
1445         dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1446                                 filt_h0, filt_h1, filt_h2, filt_h3);
1447         dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1448                                 filt_h0, filt_h1, filt_h2, filt_h3);
1449         dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
1450                                 filt_h0, filt_h1, filt_h2, filt_h3);
1451         dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
1452                                 filt_h0, filt_h1, filt_h2, filt_h3);
1453         SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1454         PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1455         ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
1456         dst += (4 * dst_stride);
1457 
            /* slide the interleaved vertical history by 4 rows */
1458         dst10_r = dst54_r;
1459         dst32_r = dst76_r;
1460         dst54_r = dst98_r;
1461         dst21_r = dst65_r;
1462         dst43_r = dst87_r;
1463         dst65_r = dst109_r;
1464         dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1465     }
1466 }
1467
/* 2-D (horizontal then vertical) HEVC 8-tap luma interpolation for any
 * width that is a multiple of 8. Per 8-column stripe: horizontally filter
 * 7 history rows into 16-bit dst0..dst6, then produce one output row per
 * inner-loop iteration — filter the new row horizontally (dst7), apply
 * the vertical 8-tap via HEVC_FILT_8TAP on interleaved 16-bit values,
 * shift by 6, pack and store. History rotates one row per iteration. */
hevc_hv_8t_8multx1mult_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height,int32_t width)1468 static void hevc_hv_8t_8multx1mult_msa(uint8_t *src,
1469                                        int32_t src_stride,
1470                                        int16_t *dst,
1471                                        int32_t dst_stride,
1472                                        const int8_t *filter_x,
1473                                        const int8_t *filter_y,
1474                                        int32_t height, int32_t width)
1475 {
1476     uint32_t loop_cnt, cnt;
1477     uint8_t *src_tmp;
1478     int16_t *dst_tmp;
1479     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1480     v8i16 filt0, filt1, filt2, filt3;
1481     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1482     v16i8 mask1, mask2, mask3;
1483     v8i16 filter_vec, const_vec;
1484     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1485     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1486     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1487     v4i32 dst0_r, dst0_l;
1488     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1489     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1490     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1491 
1492     src -= ((3 * src_stride) + 3);   /* 3 rows up, 3 columns left */
1493     filter_vec = LD_SH(filter_x);
1494     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1495 
1496     filter_vec = LD_SH(filter_y);
1497     UNPCK_R_SB_SH(filter_vec, filter_vec);
1498 
1499     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1500 
1501     mask1 = mask0 + 2;
1502     mask2 = mask0 + 4;
1503     mask3 = mask0 + 6;
1504 
1505     const_vec = __msa_ldi_h(128);
1506     const_vec <<= 6;
1507 
1508     for (cnt = width >> 3; cnt--;) {   /* one 8-column stripe per pass */
1509         src_tmp = src;
1510         dst_tmp = dst;
1511 
1512         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1513         src_tmp += (7 * src_stride);
1514         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1515 
1516         /* row 0 row 1 row 2 row 3 */
1517         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1518                    vec0, vec1, vec2, vec3);
1519         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1520                    vec4, vec5, vec6, vec7);
1521         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1522                    vec8, vec9, vec10, vec11);
1523         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1524                    vec12, vec13, vec14, vec15);
1525         dst0 = const_vec;
1526         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1527                      dst0, dst0, dst0, dst0);
1528         dst1 = const_vec;
1529         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1530                      dst1, dst1, dst1, dst1);
1531         dst2 = const_vec;
1532         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1533                      dst2, dst2, dst2, dst2);
1534         dst3 = const_vec;
1535         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1536                      dst3, dst3, dst3, dst3);
1537 
1538         /* row 4 row 5 row 6 */
1539         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1540                    vec0, vec1, vec2, vec3);
1541         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1542                    vec4, vec5, vec6, vec7);
1543         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1544                    vec8, vec9, vec10, vec11);
1545         dst4 = const_vec;
1546         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1547                      dst4, dst4, dst4, dst4);
1548         dst5 = const_vec;
1549         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1550                      dst5, dst5, dst5, dst5);
1551         dst6 = const_vec;
1552         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1553                      dst6, dst6, dst6, dst6);
1554 
1555         for (loop_cnt = height; loop_cnt--;) {
                /* one new input row -> one output row */
1556             src7 = LD_SB(src_tmp);
1557             src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1558             src_tmp += src_stride;
1559 
1560             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1561                        vec0, vec1, vec2, vec3);
1562             dst7 = const_vec;
1563             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1564                          dst7, dst7, dst7, dst7);
1565 
1566             ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1567             ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1568             ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1569             ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1570             dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1571                                     filt_h0, filt_h1, filt_h2, filt_h3);
1572             dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1573                                     filt_h0, filt_h1, filt_h2, filt_h3);
1574             dst0_r >>= 6;
1575             dst0_l >>= 6;
1576 
1577             dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1578             ST_SW(dst0_r, dst_tmp);
1579             dst_tmp += dst_stride;
1580 
                /* rotate the 7-row horizontal-result history by one row */
1581             dst0 = dst1;
1582             dst1 = dst2;
1583             dst2 = dst3;
1584             dst3 = dst4;
1585             dst4 = dst5;
1586             dst5 = dst6;
1587             dst6 = dst7;
1588         }
1589 
1590         src += 8;
1591         dst += 8;
1592     }
1593 }
1594
/* 2-D 8-tap, width 8: single 8-column stripe of the generic hv core. */
hevc_hv_8t_8w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1595 static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
1596                               int16_t *dst, int32_t dst_stride,
1597                               const int8_t *filter_x, const int8_t *filter_y,
1598                               int32_t height)
1599 {
1600     hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1601                                filter_x, filter_y, height, 8);
1602 }
1603
hevc_hv_8t_12w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)1604 static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
1605 int16_t *dst, int32_t dst_stride,
1606 const int8_t *filter_x, const int8_t *filter_y,
1607 int32_t height)
1608 {
1609 uint32_t loop_cnt;
1610 int32_t dst_stride_in_bytes = 2 * dst_stride;
1611 uint8_t *src_tmp;
1612 int16_t *dst_tmp;
1613 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1614 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1615 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1616 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1617 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1618 v8i16 filter_vec, const_vec;
1619 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1620 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1621 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1622 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1623 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
1624
1625 src -= ((3 * src_stride) + 3);
1626 filter_vec = LD_SH(filter_x);
1627 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1628
1629 filter_vec = LD_SH(filter_y);
1630 UNPCK_R_SB_SH(filter_vec, filter_vec);
1631
1632 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1633
1634 mask0 = LD_SB(ff_hevc_mask_arr);
1635 mask1 = mask0 + 2;
1636 mask2 = mask0 + 4;
1637 mask3 = mask0 + 6;
1638
1639 const_vec = __msa_ldi_h(128);
1640 const_vec <<= 6;
1641
1642 src_tmp = src;
1643 dst_tmp = dst;
1644
1645 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1646 src_tmp += (7 * src_stride);
1647 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1648
1649 /* row 0 row 1 row 2 row 3 */
1650 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1651 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1652 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1653 vec11);
1654 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1655 vec15);
1656 dst0 = const_vec;
1657 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
1658 dst0, dst0);
1659 dst1 = const_vec;
1660 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
1661 dst1, dst1);
1662 dst2 = const_vec;
1663 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
1664 dst2, dst2, dst2);
1665 dst3 = const_vec;
1666 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
1667 dst3, dst3, dst3);
1668
1669 /* row 4 row 5 row 6 */
1670 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1671 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1672 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1673 vec11);
1674 dst4 = const_vec;
1675 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
1676 dst4, dst4);
1677 dst5 = const_vec;
1678 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
1679 dst5, dst5);
1680 dst6 = const_vec;
1681 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
1682 dst6, dst6, dst6);
1683
1684 for (loop_cnt = height; loop_cnt--;) {
1685 src7 = LD_SB(src_tmp);
1686 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1687 src_tmp += src_stride;
1688
1689 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1690 vec3);
1691 dst7 = const_vec;
1692 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
1693 dst7, dst7, dst7);
1694
1695 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1696 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1697 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1698 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1699 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1700 filt_h1, filt_h2, filt_h3);
1701 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1702 filt_h1, filt_h2, filt_h3);
1703 dst0_r >>= 6;
1704 dst0_l >>= 6;
1705
1706 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1707 ST_SW(dst0_r, dst_tmp);
1708 dst_tmp += dst_stride;
1709
1710 dst0 = dst1;
1711 dst1 = dst2;
1712 dst2 = dst3;
1713 dst3 = dst4;
1714 dst4 = dst5;
1715 dst5 = dst6;
1716 dst6 = dst7;
1717 }
1718
1719 src += 8;
1720 dst += 8;
1721
1722 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1723 mask5 = mask4 + 2;
1724 mask6 = mask4 + 4;
1725 mask7 = mask4 + 6;
1726
1727 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1728 src += (7 * src_stride);
1729 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1730
1731 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1732 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1733 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1734 vec11);
1735 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1736 vec15);
1737 dst30 = const_vec;
1738 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
1739 dst30, dst30, dst30);
1740 dst41 = const_vec;
1741 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
1742 dst41, dst41, dst41);
1743 dst52 = const_vec;
1744 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
1745 dst52, dst52, dst52);
1746 dst63 = const_vec;
1747 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
1748 dst63, dst63, dst63);
1749
1750 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1751 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1752 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1753
1754 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1755
1756 for (loop_cnt = height >> 2; loop_cnt--;) {
1757 LD_SB4(src, src_stride, src7, src8, src9, src10);
1758 src += (4 * src_stride);
1759 XORI_B4_128_SB(src7, src8, src9, src10);
1760
1761 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1762 vec3);
1763 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1764 vec7);
1765 dst97 = const_vec;
1766 dst108 = const_vec;
1767 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
1768 dst97, dst97, dst97);
1769 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
1770 dst108, dst108, dst108);
1771
1772 dst76_r = __msa_ilvr_h(dst97, dst66);
1773 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1774 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1775 dst98_r = __msa_ilvr_h(dst66, dst108);
1776
1777 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1778 filt_h1, filt_h2, filt_h3);
1779 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1780 filt_h1, filt_h2, filt_h3);
1781 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1782 filt_h1, filt_h2, filt_h3);
1783 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1784 filt_h1, filt_h2, filt_h3);
1785 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1786 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1787 ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
1788 dst += (4 * dst_stride);
1789
1790 dst10_r = dst54_r;
1791 dst32_r = dst76_r;
1792 dst54_r = dst98_r;
1793 dst21_r = dst65_r;
1794 dst43_r = dst87_r;
1795 dst65_r = dst109_r;
1796 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1797 }
1798 }
1799
/* 16-wide HEVC 8-tap horizontal+vertical (HV) interpolation: delegates to
 * the generic multiple-of-8-columns kernel with width = 16. */
static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}
1808
/* 24-wide HEVC 8-tap HV interpolation: generic kernel with width = 24. */
static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}
1817
/* 32-wide HEVC 8-tap HV interpolation: generic kernel with width = 32. */
static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
1826
/* 48-wide HEVC 8-tap HV interpolation: generic kernel with width = 48. */
static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 48);
}
1835
/* 64-wide HEVC 8-tap HV interpolation: generic kernel with width = 64. */
static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter_x, const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 64);
}
1844
/* 4-wide, 2-row HEVC 4-tap horizontal filter producing the 16-bit
 * intermediate samples (filtered value scaled by 2^6) used by the
 * second interpolation / weighting stage. */
static void hevc_hz_4t_4x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0;
    v8i16 filter_vec, const_vec;
    /* 4-width shuffle mask (second half of ff_hevc_mask_arr): indices
       16+ select from the second source operand, so two rows are
       filtered in a single vector. */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= 1; /* step back to the first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* 128 << 6 compensates for the -128 applied to the pixels below
       (the 4-tap filter coefficients sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1); /* unsigned pixels -> signed for DPADD */

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

    ST8x2_UB(dst0, dst, 2 * dst_stride);
}
1877
/* 4-wide, 4-row HEVC 4-tap horizontal filter (16-bit intermediate
 * output, two rows packed per vector). */
static void hevc_hz_4t_4x4_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter)
{
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1;
    v8i16 filter_vec, const_vec;
    /* 4-width two-rows-per-vector shuffle mask */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);

    /* rows 0+1 */
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

    /* rows 2+3 */
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    dst1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
}
1914
/* 4-wide HEVC 4-tap horizontal filter for heights that are a multiple
 * of 8; processes 8 rows (two rows per vector) per loop iteration. */
static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter,
                                       int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    /* 4-width two-rows-per-vector shuffle mask */
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}
1963
/* 4-wide HEVC 4-tap horizontal interpolation dispatcher.
 * Routes to the fixed 2-row / 4-row kernels or to the
 * multiple-of-8-rows loop; other heights are left unhandled,
 * matching the callers' usage. */
static void hevc_hz_4t_4w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    switch (height) {
    case 2:
        hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 4:
        hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        if (0 == (height % 8)) {
            hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
                                       filter, height);
        }
        break;
    }
}
1980
/* 6-wide HEVC 4-tap horizontal filter. Each output row is stored as
 * one 8-byte store (4 samples) plus one 4-byte store (2 samples).
 * NOTE(review): the loop runs a fixed 2 x 4 = 8 rows; the height
 * argument is not read — presumably callers only use height == 8 here,
 * verify against the dispatch code. */
static void hevc_hz_4t_6w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        /* samples 0..3 of each row */
        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);

        /* samples 4..5 of each row */
        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
}
2051
/* 8-wide HEVC 4-tap horizontal filter, two rows per loop iteration
 * (for heights that are a multiple of 2 but not of 4). */
static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter,
                                       int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 src0, src1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        XORI_B2_128_SB(src0, src1);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
2095
/* 8-wide HEVC 4-tap horizontal filter, four rows per loop iteration
 * (for heights that are a multiple of 4). */
static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter,
                                       int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
2148
/* 8-wide HEVC 4-tap horizontal interpolation dispatcher.
 * Heights 2 and 6 are not multiples of 4, so they take the
 * two-rows-per-iteration kernel; every other height takes the
 * four-rows-per-iteration kernel. */
static void hevc_hz_4t_8w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    void (*hz_fn)(uint8_t *, int32_t, int16_t *, int32_t,
                  const int8_t *, int32_t);

    hz_fn = (2 == height || 6 == height) ? hevc_hz_4t_8x2multiple_msa
                                         : hevc_hz_4t_8x4multiple_msa;
    hz_fn(src, src_stride, dst, dst_stride, filter, height);
}
2164
/* 12-wide HEVC 4-tap horizontal filter, four rows per iteration.
 * Columns 0..7 use the plain 8-width mask; columns 8..11 of two rows
 * are packed into one vector via the custom mask2/mask3 (indices 24+
 * select from the second shuffle operand). */
static void hevc_hz_4t_12w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    uint32_t loop_cnt;
    v8i16 filt0, filt1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask3;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    /* two-row mask for the rightmost 4 columns (8..11) */
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* left 8 columns, rows 0..3 */
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        /* right 4 columns, two rows per vector */
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    }
}
2225
/* 16-wide HEVC 4-tap horizontal filter, four rows per iteration.
 * Each row is handled as two overlapping 16-byte loads (offset 0 and
 * 8) so the shuffle masks cover the full 16 output columns. */
static void hevc_hz_4t_16w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 src4, src5, src6, src7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 vec0, vec1;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* even regs: columns 0..; odd regs: columns 8.. of each row */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
2297
/* 24-wide HEVC 4-tap horizontal filter, four rows per iteration.
 * The left 16 columns are filtered from two 16-byte loads per row
 * (mask00/mask11 straddle the pair), and the right 8 columns are
 * filtered separately into dst_tmp = dst + 16. */
static void hevc_hz_4t_24w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    uint32_t loop_cnt;
    int16_t *dst_tmp = dst + 16; /* output pointer for columns 16..23 */
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1, mask00, mask11;
    v16i8 vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    /* masks for columns 8..15, spanning two adjacent source vectors */
    mask00 = mask0 + 8;
    mask11 = mask0 + 10;

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16 width */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        /* 8 width */
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }
}
2398
/* 32-wide HEVC 4-tap horizontal filter, one row per iteration.
 * Three loads cover the 35 input bytes needed; the four 8-sample
 * outputs are accumulated with the two filter taps in two DPADD
 * passes (filt0 then filt1). */
static void hevc_hz_4t_32w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr); /* 8-width shuffle mask */
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    src -= 1; /* first tap of the 4-tap window */

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    mask1 = mask0 + 2;
    /* masks for columns 8..15, spanning two adjacent source vectors */
    mask2 = mask0 + 8;
    mask3 = mask0 + 10;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        XORI_B3_128_SB(src0, src1, src2);

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        dst3 = const_vec;
        /* first tap pair (filt0) for all four 8-column groups */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        /* second tap pair (filt1) */
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
    }
}
2450
/* 4-wide, 2-row HEVC 4-tap vertical filter (16-bit intermediate
 * output). Two rows' column pairs are interleaved and packed so one
 * dot-product vector covers both output rows. */
static void hevc_vt_4t_4x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride; /* step back one row to the first tap */

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    /* interleave vertically adjacent rows for the dot product */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);

    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332); /* unsigned pixels -> signed */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);

    ST8x2_UB(dst10, dst, 2 * dst_stride);
}
2483
/* 4-wide, 4-row HEVC 4-tap vertical filter (16-bit intermediate
 * output, two rows packed per vector).
 * NOTE(review): the height argument is not read — this kernel always
 * produces exactly 4 rows; confirm callers only dispatch height == 4. */
static void hevc_vt_4t_4x4_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 dst10, dst32;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride; /* step back one row to the first tap */

    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    /* interleave vertically adjacent rows for the dot product */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554); /* unsigned -> signed */
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);

    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
}
2520
/* 4-wide, 8-row HEVC 4-tap vertical filter (16-bit intermediate
 * output, two rows packed per vector).
 * NOTE(review): the height argument is not read — this kernel always
 * produces exactly 8 rows; confirm callers only dispatch height == 8. */
static void hevc_vt_4t_4x8_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride; /* step back one row to the first tap */
    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: first three rows give the first interleaved pair */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
               src76_r, src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);
    XORI_B4_128_SB(src4332, src6554, src8776, src10998);
    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
    dst += (8 * dst_stride);
}
2570
/* 4-wide, 16-row HEVC 4-tap vertical filter: two unrolled 8-row passes,
 * carrying src10 / src10998 across as the tap history for the second.
 * NOTE(review): the height argument is not read — this kernel always
 * produces exactly 16 rows; confirm callers only dispatch height == 16. */
static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                int16_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998;
    v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;

    src -= src_stride; /* step back one row to the first tap */
    /* bias compensating the -128 pixel offset applied below */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: first three rows give the first interleaved pair */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    /* first 8 output rows */
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
               src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);
    XORI_B4_128_SB(src4332, src6554, src8776, src10998)

    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
    dst += (8 * dst_stride);

    /* carry the last loaded row / interleaved pair into the next pass */
    src2 = src10;
    src2110 = src10998;

    /* second 8 output rows */
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,
               src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);
    XORI_B4_128_SB(src4332, src6554, src8776, src10998);

    dst10 = const_vec;
    dst32 = const_vec;
    dst54 = const_vec;
    dst76 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
    dst += (8 * dst_stride);
}
2641
/* Vertical 4-tap filter, width 4: dispatch on the supported row counts.
 * Heights other than 2/4/8/16 are not handled here. */
static void hevc_vt_4t_4w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    switch (height) {
    case 2:
        hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 4:
        hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
        break;
    case 8:
        hevc_vt_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, height);
        break;
    case 16:
        hevc_vt_4t_4x16_msa(src, src_stride, dst, dst_stride, filter, height);
        break;
    default:
        break;
    }
}
2659
/* Vertical 4-tap filter for 6-pixel-wide blocks, 4 rows per loop iteration.
 * Each output row is six int16 intermediates, stored as one 8-byte SD
 * (samples 0..3) plus one 4-byte SW (samples 4..5).
 * height is assumed to be a multiple of 4. */
static void hevc_vt_4t_6w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    int32_t loop_cnt;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2 */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        /* next two rows; src1/src2 are reused for the newest input rows,
         * so src10_r/src21_r now hold the (5,4)/(6,5) interleaves */
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);
        XORI_B2_128_SB(src1, src2);
        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        /* extract 8-byte + 4-byte pieces of each 6-sample row */
        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
}
2733
/* Vertical 4-tap filter, 8-pixel-wide block, exactly 2 output rows.
 * Loads 5 input rows (one above, two below) and stores two full
 * 8-sample int16 rows of intermediate data. */
static void hevc_vt_4t_8x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
}
2768
/* Vertical 4-tap filter, 8-pixel-wide block, exactly 6 output rows,
 * fully unrolled as three 2-row stages with rotating interleave pairs. */
static void hevc_vt_4t_8x6_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2 */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    /* stage 1: output rows 0-1 */
    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    /* stage 2: output rows 2-3; src1/src2 reused for the newest rows,
     * so src10_r/src21_r now hold the (5,4)/(6,5) interleaves */
    LD_SB2(src, src_stride, src1, src2);
    src += (2 * src_stride);
    XORI_B2_128_SB(src1, src2);

    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    /* stage 3: output rows 4-5 */
    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
}
2830
/* Vertical 4-tap filter, 8-pixel-wide block, height a multiple of 4.
 * Software-pipelined: the two newest input rows and their interleaves are
 * carried across iterations so only 4 fresh rows are loaded per loop. */
static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter,
                                       int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2 */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        dst0_r = const_vec;
        dst1_r = const_vec;
        dst2_r = const_vec;
        dst3_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);

        /* rotate pipeline state for the next iteration */
        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
    }
}
2879
/* Vertical 4-tap filter, width 8: pick the unrolled 2/6-row kernel or
 * the generic multiple-of-4 loop. */
static void hevc_vt_4t_8w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter,
                              int32_t height)
{
    switch (height) {
    case 2:
        hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
        break;
    case 6:
        hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
        break;
    default:
        hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
                                   filter, height);
        break;
    }
}
2896
/* Vertical 4-tap filter for 12-pixel-wide blocks, 4 rows per iteration.
 * Columns 0-7 use right (ILVR) interleaves and full 8-sample stores;
 * columns 8-11 use left (ILVL) interleaves packed two-rows-per-vector
 * and 8-byte stores at dst + 8.
 * NOTE(review): the loop count is fixed at 4 (16 rows) and the height
 * parameter is never read — presumably callers only pass height 16;
 * confirm against the call sites. */
static void hevc_vt_4t_12w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src65_r, src6554;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= (1 * src_stride);  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2, both halves */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        LD_SB2(src, src_stride, src5, src6);
        src += (2 * src_stride);
        XORI_B2_128_SB(src3, src4);
        XORI_B2_128_SB(src5, src6);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        /* left 8 columns: one vector per output row */
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        /* right 4 columns: two output rows per vector */
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
        dst += (4 * dst_stride);

        /* rotate pipeline state for the next iteration */
        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}
2967
/* Vertical 4-tap filter for 16-pixel-wide blocks, 4 rows per iteration.
 * Each input row spans one 16-byte vector; the right (ILVR) interleaves
 * feed columns 0-7 and the left (ILVL) interleaves columns 8-15.
 * height is assumed to be a multiple of 4. */
static void hevc_vt_4t_16w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2 */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;

        /* rows 5 and 6 land in src5/src2; the reused src10_*/src21_*
         * registers now hold the (5,4)/(6,5) interleaves */
        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;
    }
}
3034
/* Vertical 4-tap filter for 24-pixel-wide blocks, 4 rows per iteration.
 * Columns 0-15 are processed like the 16-wide case (ILVR + ILVL halves);
 * columns 16-23 come from a second load at src + 16 using right
 * interleaves only (src6..src11, src76_r/src87_r/...).
 * height is assumed to be a multiple of 4. */
static void hevc_vt_4t_24w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2, columns 0-15 */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* prologue: rows 0..2, columns 16-23 */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        /* row n: 16 samples at dst, 8 more at dst + 16 */
        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;

        /* second row pair; reused registers hold the newest interleaves */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;
    }
}
3132
/* Vertical 4-tap filter for 32-pixel-wide blocks, 4 rows per iteration.
 * Processes two independent 16-pixel halves per row: columns 0-15 from
 * loads at src, columns 16-31 from loads at src + 16, each with
 * right (ILVR) and left (ILVL) interleaves.
 * height is assumed to be a multiple of 4. */
static void hevc_vt_4t_32w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter,
                               int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;  /* back up one row so the first tap sees row -1 */
    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the 4 taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* prologue: rows 0..2, columns 0-15 */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* prologue: rows 0..2, columns 16-31 */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        /* 32 samples per row: four 8-sample stores */
        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;

        /* second row pair; reused registers hold the newest interleaves */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
    }
}
3238
/* 2-D (horizontal then vertical) 4-tap filter, 4-pixel-wide block,
 * exactly 2 output rows.  The horizontal pass uses VSHF byte shuffles
 * (mask table at ff_hevc_mask_arr + 16, the 4-width variant) plus
 * dot-products; the vertical pass runs HEVC_FILT_4TAP on the interleaved
 * horizontal results and shifts the 32-bit sums right by 6. */
static void hevc_hv_4t_4x2_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    /* ST8x2_UB takes a byte stride, dst is int16_t */
    int32_t dst_stride_in_bytes = 2 * dst_stride;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
    v4i32 dst0, dst1;

    src -= (src_stride + 1);  /* back up one row and one column for the taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal pass on rows 0..4, two rows packed per vector */
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    dst20 = const_vec;
    dst31 = const_vec;
    dst42 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42);
    ILVRL_H2_SH(dst31, dst20, dst10, dst32);
    ILVRL_H2_SH(dst42, dst31, dst21, dst43);

    /* vertical pass and >> 6 normalization */
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst0 >>= 6;
    dst1 >>= 6;
    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    ST8x2_UB(dst0, dst, dst_stride_in_bytes);
}
3293
/* 2-D (horizontal then vertical) 4-tap filter, 4-pixel-wide block,
 * exactly 4 output rows.  Same scheme as the 4x2 variant: VSHF-based
 * horizontal dot-products with two rows per vector, then HEVC_FILT_4TAP
 * vertically and a >> 6 shift of the 32-bit sums. */
static void hevc_hv_4t_4x4_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y)
{
    /* ST8x4_UB takes a byte stride, dst is int16_t */
    int32_t dst_stride_in_bytes = 2 * dst_stride;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);   /* 4-width shuffle masks */
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, const_vec;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);  /* back up one row and one column for the taps */

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* horizontal pass on rows 0..6, rows (n, n+3) packed per vector */
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    dst30 = const_vec;
    dst41 = const_vec;
    dst52 = const_vec;
    dst63 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52);
    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63);

    ILVRL_H2_SH(dst41, dst30, dst10, dst43);
    ILVRL_H2_SH(dst52, dst41, dst21, dst54);
    ILVRL_H2_SH(dst63, dst52, dst32, dst65);

    /* vertical pass and >> 6 normalization */
    dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
    dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
    dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
    dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
    ST8x4_UB(dst0, dst2, dst, dst_stride_in_bytes);
}
3356
3357
/* 2-D (horizontal then vertical) 4-tap filter, 4-pixel-wide block,
 * height a multiple of 8.  The horizontal pass packs rows (n, n+4) into
 * one vector (dst73 = rows 7&3, etc.); the vertical pass runs
 * HEVC_FILT_4TAP on interleaved halves, shifts by 6, and stores 8 rows
 * per iteration.  The last two interleaves are carried across loops. */
static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       int16_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);   /* 4-width shuffle masks */
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);  /* back up one row and one column for the taps */
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bit, then splat 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 bias compensating the XOR-by-128 signed conversion below
     * (assumes the taps sum to 64 — standard for HEVC; verify) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* prologue: horizontal pass on rows 0..2 */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
    dst10 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
    dst21 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);  /* row 2 alone */

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        /* horizontal pass, rows (3,7) (4,8) (5,9) (6,10) packed */
        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst73 = const_vec;
        dst84 = const_vec;
        dst95 = const_vec;
        dst106 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        /* rebuild consecutive-row interleaves from the packed results */
        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        /* vertical pass and >> 6 normalization for 8 rows */
        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);

        /* carry the newest interleaves into the next iteration */
        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
3453
/* 2-D 4-tap filter, width 4: route to the unrolled 2/4-row kernels or
 * the multiple-of-8 loop.  Other heights are not handled. */
static void hevc_hv_4t_4w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (height == 2) {
        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
        return;
    }
    if (height == 4) {
        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
        return;
    }
    if (!(height % 8)) {
        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height);
    }
}
3473
hevc_hv_4t_6w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)3474 static void hevc_hv_4t_6w_msa(uint8_t *src,
3475 int32_t src_stride,
3476 int16_t *dst,
3477 int32_t dst_stride,
3478 const int8_t *filter_x,
3479 const int8_t *filter_y,
3480 int32_t height)
3481 {
3482 int32_t dst_stride_in_bytes = 2 * dst_stride;
3483 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3484 v8i16 filt0, filt1;
3485 v8i16 filt_h0, filt_h1;
3486 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3487 v16i8 mask1;
3488 v8i16 filter_vec, const_vec;
3489 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3490 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3491 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3492 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
3493 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
3494 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
3495 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3496 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3497 v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
3498
3499 src -= (src_stride + 1);
3500 filter_vec = LD_SH(filter_x);
3501 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3502
3503 filter_vec = LD_SH(filter_y);
3504 UNPCK_R_SB_SH(filter_vec, filter_vec);
3505
3506 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3507
3508 mask1 = mask0 + 2;
3509
3510 const_vec = __msa_ldi_h(128);
3511 const_vec <<= 6;
3512
3513 LD_SB3(src, src_stride, src0, src1, src2);
3514 src += (3 * src_stride);
3515 XORI_B3_128_SB(src0, src1, src2);
3516
3517 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3518 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3519 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3520
3521 dsth0 = const_vec;
3522 dsth1 = const_vec;
3523 dsth2 = const_vec;
3524 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
3525 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
3526 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);
3527
3528 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3529 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3530
3531 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3532 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3533
3534 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3535 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3536 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3537 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3538
3539 dsth3 = const_vec;
3540 dsth4 = const_vec;
3541 dsth5 = const_vec;
3542 dsth6 = const_vec;
3543 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
3544 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
3545 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
3546 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);
3547
3548 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3549 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3550 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3551 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3552
3553 dsth7 = const_vec;
3554 dsth8 = const_vec;
3555 dsth9 = const_vec;
3556 dsth10 = const_vec;
3557 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
3558 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
3559 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
3560 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);
3561
3562 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3563 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3564 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3565 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3566 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3567 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3568 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3569 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3570
3571 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3572 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3573 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3574
3575 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3576 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3577 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3578 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3579 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3580 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3581 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3582 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3583 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3584 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3585 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3586 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3587 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3588 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3589 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3590 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3591 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3592 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3593 ST8x4_UB(tmp0, tmp1, dst, dst_stride_in_bytes);
3594 ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
3595 dst += 4 * dst_stride;
3596 ST8x4_UB(tmp2, tmp3, dst, dst_stride_in_bytes);
3597 ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
3598 }
3599
hevc_hv_4t_8x2_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y)3600 static void hevc_hv_4t_8x2_msa(uint8_t *src,
3601 int32_t src_stride,
3602 int16_t *dst,
3603 int32_t dst_stride,
3604 const int8_t *filter_x,
3605 const int8_t *filter_y)
3606 {
3607 v16i8 src0, src1, src2, src3, src4;
3608 v8i16 filt0, filt1;
3609 v8i16 filt_h0, filt_h1;
3610 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3611 v16i8 mask1;
3612 v8i16 filter_vec, const_vec;
3613 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3614 v8i16 dst0, dst1, dst2, dst3, dst4;
3615 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3616 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3617 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3618
3619 src -= (src_stride + 1);
3620
3621 filter_vec = LD_SH(filter_x);
3622 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3623
3624 filter_vec = LD_SH(filter_y);
3625 UNPCK_R_SB_SH(filter_vec, filter_vec);
3626
3627 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3628
3629 mask1 = mask0 + 2;
3630
3631 const_vec = __msa_ldi_h(128);
3632 const_vec <<= 6;
3633
3634 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3635 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3636
3637 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3638 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3639 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3640 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3641 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3642
3643 dst0 = const_vec;
3644 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3645 dst1 = const_vec;
3646 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3647 dst2 = const_vec;
3648 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3649 dst3 = const_vec;
3650 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3651 dst4 = const_vec;
3652 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
3653
3654 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3655 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3656 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3657 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3658 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3659 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3660 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3661 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3662 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3663 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3664 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3665 }
3666
hevc_hv_4t_8multx4_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t width8mult)3667 static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
3668 int16_t *dst, int32_t dst_stride,
3669 const int8_t *filter_x,
3670 const int8_t *filter_y, int32_t width8mult)
3671 {
3672 int32_t cnt;
3673 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3674 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3675 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
3676 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3677 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3678 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3679 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3680
3681 src -= (src_stride + 1);
3682
3683 filter_vec = LD_SH(filter_x);
3684 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3685
3686 filter_vec = LD_SH(filter_y);
3687 UNPCK_R_SB_SH(filter_vec, filter_vec);
3688
3689 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3690
3691 mask0 = LD_SB(ff_hevc_mask_arr);
3692 mask1 = mask0 + 2;
3693
3694 const_vec = __msa_ldi_h(128);
3695 const_vec <<= 6;
3696
3697 for (cnt = width8mult; cnt--;) {
3698 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3699 src += 8;
3700 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3701
3702 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3703 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3704 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3705
3706 dst0 = const_vec;
3707 dst1 = const_vec;
3708 dst2 = const_vec;
3709 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3710 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3711 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3712
3713 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3714 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3715
3716 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3717 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3718 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3719 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3720 dst3 = const_vec;
3721 dst4 = const_vec;
3722 dst5 = const_vec;
3723 dst6 = const_vec;
3724 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3725 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3726 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3727 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3728 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3729 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3730 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3731 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3732 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3733 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3734 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3735 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3736
3737 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3738 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3739 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3740 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3741 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3742 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3743 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3744 PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3745
3746 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
3747 dst += 8;
3748 }
3749 }
3750
hevc_hv_4t_8x6_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y)3751 static void hevc_hv_4t_8x6_msa(uint8_t *src,
3752 int32_t src_stride,
3753 int16_t *dst,
3754 int32_t dst_stride,
3755 const int8_t *filter_x,
3756 const int8_t *filter_y)
3757 {
3758 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3759 v8i16 filt0, filt1;
3760 v8i16 filt_h0, filt_h1;
3761 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3762 v16i8 mask1;
3763 v8i16 filter_vec, const_vec;
3764 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3765 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3766 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3767 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3768 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3769 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3770 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3771 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3772 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3773
3774 src -= (src_stride + 1);
3775
3776 filter_vec = LD_SH(filter_x);
3777 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3778
3779 filter_vec = LD_SH(filter_y);
3780 UNPCK_R_SB_SH(filter_vec, filter_vec);
3781
3782 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3783
3784 mask1 = mask0 + 2;
3785
3786 const_vec = __msa_ldi_h(128);
3787 const_vec <<= 6;
3788
3789 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3790 src += (5 * src_stride);
3791 LD_SB4(src, src_stride, src5, src6, src7, src8);
3792
3793 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3794 XORI_B4_128_SB(src5, src6, src7, src8);
3795
3796 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3797 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3798 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3799 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3800 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3801 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3802 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3803 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3804 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3805
3806 dst0 = const_vec;
3807 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3808 dst1 = const_vec;
3809 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3810 dst2 = const_vec;
3811 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3812 dst3 = const_vec;
3813 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3814 dst4 = const_vec;
3815 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
3816 dst5 = const_vec;
3817 DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
3818 dst6 = const_vec;
3819 DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
3820 dst7 = const_vec;
3821 DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
3822 dst8 = const_vec;
3823 DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);
3824
3825 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3826 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3827 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3828 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3829 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3830 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3831 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3832 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3833
3834 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3835 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3836 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3837 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3838 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3839 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3840 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3841 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3842 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3843 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3844 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3845 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3846
3847 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3848 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3849 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3850
3851 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3852 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
3853 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
3854
3855 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3856 dst += (2 * dst_stride);
3857 ST_SW2(dst2_r, dst3_r, dst, dst_stride);
3858 dst += (2 * dst_stride);
3859 ST_SW2(dst4_r, dst5_r, dst, dst_stride);
3860 }
3861
hevc_hv_4t_8multx4mult_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height,int32_t width8mult)3862 static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
3863 int32_t src_stride,
3864 int16_t *dst,
3865 int32_t dst_stride,
3866 const int8_t *filter_x,
3867 const int8_t *filter_y,
3868 int32_t height,
3869 int32_t width8mult)
3870 {
3871 uint32_t loop_cnt, cnt;
3872 uint8_t *src_tmp;
3873 int16_t *dst_tmp;
3874 v16i8 src0, src1, src2, src3, src4, src5, src6;
3875 v8i16 filt0, filt1;
3876 v8i16 filt_h0, filt_h1;
3877 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3878 v16i8 mask1;
3879 v8i16 filter_vec, const_vec;
3880 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3881 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3882 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3883 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3884 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3885
3886 src -= (src_stride + 1);
3887
3888 filter_vec = LD_SH(filter_x);
3889 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3890
3891 filter_vec = LD_SH(filter_y);
3892 UNPCK_R_SB_SH(filter_vec, filter_vec);
3893
3894 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3895
3896 mask1 = mask0 + 2;
3897
3898 const_vec = __msa_ldi_h(128);
3899 const_vec <<= 6;
3900
3901 for (cnt = width8mult; cnt--;) {
3902 src_tmp = src;
3903 dst_tmp = dst;
3904
3905 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3906 src_tmp += (3 * src_stride);
3907
3908 XORI_B3_128_SB(src0, src1, src2);
3909
3910 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3911 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3912 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3913
3914 dst0 = const_vec;
3915 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3916 dst1 = const_vec;
3917 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3918 dst2 = const_vec;
3919 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3920
3921 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3922 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3923
3924 for (loop_cnt = height >> 2; loop_cnt--;) {
3925 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3926 src_tmp += (4 * src_stride);
3927 XORI_B4_128_SB(src3, src4, src5, src6);
3928
3929 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3930 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3931 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3932 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3933
3934 dst3 = const_vec;
3935 dst4 = const_vec;
3936 dst5 = const_vec;
3937 dst6 = const_vec;
3938 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3939 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3940 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3941 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3942
3943 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3944 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3945 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3946 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3947
3948 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3949 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3950 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3951 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3952 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3953 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3954 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3955 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3956
3957 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3958 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3959
3960 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3961 dst2_l, dst2_r, dst3_l, dst3_r,
3962 dst0_r, dst1_r, dst2_r, dst3_r);
3963
3964 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
3965 dst_tmp += (4 * dst_stride);
3966
3967 dst10_r = dst54_r;
3968 dst10_l = dst54_l;
3969 dst21_r = dst65_r;
3970 dst21_l = dst65_l;
3971 dst2 = dst6;
3972 }
3973
3974 src += 8;
3975 dst += 8;
3976 }
3977 }
3978
hevc_hv_4t_8w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)3979 static void hevc_hv_4t_8w_msa(uint8_t *src,
3980 int32_t src_stride,
3981 int16_t *dst,
3982 int32_t dst_stride,
3983 const int8_t *filter_x,
3984 const int8_t *filter_y,
3985 int32_t height)
3986 {
3987
3988 if (2 == height) {
3989 hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
3990 filter_x, filter_y);
3991 } else if (4 == height) {
3992 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3993 filter_x, filter_y, 1);
3994 } else if (6 == height) {
3995 hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
3996 filter_x, filter_y);
3997 } else if (0 == (height % 4)) {
3998 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3999 filter_x, filter_y, height, 1);
4000 }
4001 }
4002
hevc_hv_4t_12w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)4003 static void hevc_hv_4t_12w_msa(uint8_t *src,
4004 int32_t src_stride,
4005 int16_t *dst,
4006 int32_t dst_stride,
4007 const int8_t *filter_x,
4008 const int8_t *filter_y,
4009 int32_t height)
4010 {
4011 uint32_t loop_cnt;
4012 uint8_t *src_tmp;
4013 int16_t *dst_tmp;
4014 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4015 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4016 v16i8 mask0, mask1, mask2, mask3;
4017 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4018 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
4019 v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
4020 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4021 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4022 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4023 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4024
4025 src -= (src_stride + 1);
4026
4027 filter_vec = LD_SH(filter_x);
4028 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4029
4030 filter_vec = LD_SH(filter_y);
4031 UNPCK_R_SB_SH(filter_vec, filter_vec);
4032
4033 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4034
4035 mask0 = LD_SB(ff_hevc_mask_arr);
4036 mask1 = mask0 + 2;
4037
4038 const_vec = __msa_ldi_h(128);
4039 const_vec <<= 6;
4040
4041 src_tmp = src;
4042 dst_tmp = dst;
4043
4044 LD_SB3(src_tmp, src_stride, src0, src1, src2);
4045 src_tmp += (3 * src_stride);
4046
4047 XORI_B3_128_SB(src0, src1, src2);
4048
4049 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4050 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4051 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4052
4053 dst0 = const_vec;
4054 dst1 = const_vec;
4055 dst2 = const_vec;
4056 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4057 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4058 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4059
4060 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4061 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4062
4063 for (loop_cnt = 4; loop_cnt--;) {
4064 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4065 src_tmp += (4 * src_stride);
4066 XORI_B4_128_SB(src3, src4, src5, src6);
4067
4068 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4069 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4070 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4071 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4072
4073 dst3 = const_vec;
4074 dst4 = const_vec;
4075 dst5 = const_vec;
4076 dst6 = const_vec;
4077 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4078 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
4079 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
4080 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
4081
4082 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4083 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4084 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4085 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4086
4087 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4088 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4089 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4090 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4091 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4092 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4093 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4094 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4095
4096 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4097 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4098 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4099 dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
4100 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
4101 dst_tmp += (4 * dst_stride);
4102
4103 dst10_r = dst54_r;
4104 dst10_l = dst54_l;
4105 dst21_r = dst65_r;
4106 dst21_l = dst65_l;
4107 dst2 = dst6;
4108 }
4109
4110 src += 8;
4111 dst += 8;
4112
4113 mask2 = LD_SB(ff_hevc_mask_arr + 16);
4114 mask3 = mask2 + 2;
4115
4116 LD_SB3(src, src_stride, src0, src1, src2);
4117 src += (3 * src_stride);
4118 XORI_B3_128_SB(src0, src1, src2);
4119 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4120 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4121 dst10 = const_vec;
4122 dst21 = const_vec;
4123 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
4124 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
4125 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4126 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4127
4128 for (loop_cnt = 2; loop_cnt--;) {
4129 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
4130 src10);
4131 src += (8 * src_stride);
4132 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4133 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4134 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4135 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4136 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4137
4138 dst73 = const_vec;
4139 dst84 = const_vec;
4140 dst95 = const_vec;
4141 dst106 = const_vec;
4142 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
4143 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
4144 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
4145 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
4146
4147 dst32_r = __msa_ilvr_h(dst73, dst22);
4148 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4149 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4150 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4151 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4152 dst76_r = __msa_ilvr_h(dst22, dst106);
4153
4154 tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4155 tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4156 tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4157 tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4158 tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4159 tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4160 tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4161 tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4162
4163 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
4164 SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
4165 PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
4166 tmp2, tmp3);
4167 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, 2 * dst_stride);
4168 dst += (8 * dst_stride);
4169
4170 dst10_r = dst98_r;
4171 dst21_r = dst109_r;
4172 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4173 }
4174 }
4175
hevc_hv_4t_16w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)4176 static void hevc_hv_4t_16w_msa(uint8_t *src,
4177 int32_t src_stride,
4178 int16_t *dst,
4179 int32_t dst_stride,
4180 const int8_t *filter_x,
4181 const int8_t *filter_y,
4182 int32_t height)
4183 {
4184 if (4 == height) {
4185 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
4186 filter_x, filter_y, 2);
4187 } else {
4188 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4189 filter_x, filter_y, height, 2);
4190 }
4191 }
4192
hevc_hv_4t_24w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)4193 static void hevc_hv_4t_24w_msa(uint8_t *src,
4194 int32_t src_stride,
4195 int16_t *dst,
4196 int32_t dst_stride,
4197 const int8_t *filter_x,
4198 const int8_t *filter_y,
4199 int32_t height)
4200 {
4201 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4202 filter_x, filter_y, height, 3);
4203 }
4204
hevc_hv_4t_32w_msa(uint8_t * src,int32_t src_stride,int16_t * dst,int32_t dst_stride,const int8_t * filter_x,const int8_t * filter_y,int32_t height)4205 static void hevc_hv_4t_32w_msa(uint8_t *src,
4206 int32_t src_stride,
4207 int16_t *dst,
4208 int32_t dst_stride,
4209 const int8_t *filter_x,
4210 const int8_t *filter_y,
4211 int32_t height)
4212 {
4213 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4214 filter_x, filter_y, height, 4);
4215 }
4216
/* Instantiate the exported 8-bit pel_pixels (plain copy) entry points.
 * Each widens an 8-bit source block into the 16-bit intermediate layout
 * (samples << 6, see hevc_copy_4w_msa) expected by the HEVC MC pipeline,
 * always writing with a MAX_PB_SIZE destination stride.  mx, my and
 * width belong to the common ff_hevc_put_hevc_* signature and are
 * unused here. */
#define MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height,               \
                                                intptr_t mx,              \
                                                intptr_t my,              \
                                                int width)                \
{                                                                         \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
}

MC_COPY(4);
MC_COPY(6);
MC_COPY(8);
MC_COPY(12);
MC_COPY(16);
MC_COPY(24);
MC_COPY(32);
MC_COPY(48);
MC_COPY(64);

#undef MC_COPY
4240
/* Instantiate the exported uni-directional interpolation entry points.
 * PEL selects the filter family (qpel = 8-tap, epel = 4-tap), DIR/DIR1
 * the direction (h/hz horizontal, v/vt vertical), and FILT_DIR picks
 * which fractional offset (mx or my) indexes the 1-based filter table.
 * The result is the 16-bit intermediate, stored with MAX_PB_SIZE
 * stride; the trailing width argument of the common signature is
 * unused. */
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
                                                                          \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
                                          MAX_PB_SIZE, filter, height);   \
}

MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);

MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);

MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);

#undef MC
4291
/* Instantiate the exported 2-D (horizontal + vertical) interpolation
 * entry points.  PEL selects the filter family (qpel = 8-tap,
 * epel = 4-tap); mx and my index the 1-based filter table to select
 * the horizontal and vertical filters.  The 16-bit intermediate is
 * stored with MAX_PB_SIZE stride; the trailing width argument of the
 * common signature is unused. */
#define MC_HV(PEL, WIDTH, TAP)                                          \
void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,           \
                                                uint8_t *src,           \
                                                ptrdiff_t src_stride,   \
                                                int height,             \
                                                intptr_t mx,            \
                                                intptr_t my,            \
                                                int width)              \
{                                                                       \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];           \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];           \
                                                                        \
    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
                                    filter_x, filter_y, height);        \
}

MC_HV(qpel, 4, 8);
MC_HV(qpel, 8, 8);
MC_HV(qpel, 12, 8);
MC_HV(qpel, 16, 8);
MC_HV(qpel, 24, 8);
MC_HV(qpel, 32, 8);
MC_HV(qpel, 48, 8);
MC_HV(qpel, 64, 8);

MC_HV(epel, 4, 4);
MC_HV(epel, 6, 4);
MC_HV(epel, 8, 4);
MC_HV(epel, 12, 4);
MC_HV(epel, 16, 4);
MC_HV(epel, 24, 4);
MC_HV(epel, 32, 4);

#undef MC_HV
4326