1 /*
2 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include "libavcodec/vp9dsp.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "vp9dsp_mips.h"
24
/*
 * 4-tap VP9 loop filter applied to sixteen byte lanes at once.
 *
 * p1_in, p0_in, q0_in and q1_in are v16u8 rows.  Each one is moved into the
 * signed range by flipping its top bit (xor 0x80) so that the saturating
 * signed add/subtract instructions can be used; every result is flipped
 * back before it is written to p1_out .. q1_out.
 *
 * All four inputs are consumed before the first output is assigned, so an
 * output may be the very same variable as its input.
 *
 * mask_in limits which lanes receive any correction.  hev_in selects the
 * lanes in which the (p1 - q1) term is used and in which p1/q1 are left
 * alone.  NOTE: hev_in is overwritten with its bitwise complement.
 */
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    const v16i8 four_m  = __msa_ldi_b(4);                                \
    const v16i8 three_m = __msa_ldi_b(3);                                \
    v16i8 p1_s_m, p0_s_m, q0_s_m, q1_s_m;                                \
    v16i8 step_m, base_m, q0_adj_m, p0_adj_m, outer_adj_m;               \
                                                                         \
    /* bias the unsigned pixels into the signed range */                 \
    p1_s_m = (v16i8) __msa_xori_b(p1_in, 0x80);                          \
    p0_s_m = (v16i8) __msa_xori_b(p0_in, 0x80);                          \
    q0_s_m = (v16i8) __msa_xori_b(q0_in, 0x80);                          \
    q1_s_m = (v16i8) __msa_xori_b(q1_in, 0x80);                          \
                                                                         \
    /* ((p1 - q1) & hev) + 3 * (q0 - p0), saturating at every step */    \
    base_m  = __msa_subs_s_b(p1_s_m, q1_s_m);                            \
    base_m &= (v16i8) hev_in;                                            \
    step_m  = __msa_subs_s_b(q0_s_m, p0_s_m);                            \
    base_m  = __msa_adds_s_b(base_m, step_m);                            \
    base_m  = __msa_adds_s_b(base_m, step_m);                            \
    base_m  = __msa_adds_s_b(base_m, step_m);                            \
    base_m &= (v16i8) mask_in;                                           \
                                                                         \
    /* the two inner corrections: (base + 4) >> 3 and (base + 3) >> 3 */ \
    q0_adj_m = __msa_adds_s_b(base_m, four_m);                           \
    q0_adj_m = q0_adj_m >> 3;                                            \
    p0_adj_m = __msa_adds_s_b(base_m, three_m);                          \
    p0_adj_m = p0_adj_m >> 3;                                            \
                                                                         \
    q0_s_m = __msa_subs_s_b(q0_s_m, q0_adj_m);                           \
    q0_out = __msa_xori_b((v16u8) q0_s_m, 0x80);                         \
    p0_s_m = __msa_adds_s_b(p0_s_m, p0_adj_m);                           \
    p0_out = __msa_xori_b((v16u8) p0_s_m, 0x80);                         \
                                                                         \
    /* outer correction: rounded half of the q0 one, only where !hev */  \
    outer_adj_m  = __msa_srari_b(q0_adj_m, 1);                           \
    hev_in       = __msa_xori_b((v16u8) hev_in, 0xff);                   \
    outer_adj_m &= (v16i8) hev_in;                                       \
                                                                         \
    q1_s_m = __msa_subs_s_b(q1_s_m, outer_adj_m);                        \
    q1_out = __msa_xori_b((v16u8) q1_s_m, 0x80);                         \
    p1_s_m = __msa_adds_s_b(p1_s_m, outer_adj_m);                        \
    p1_out = __msa_xori_b((v16u8) p1_s_m, 0x80);                         \
}
67
/*
 * Flatness test over p3..q3 for sixteen byte lanes.
 *
 * On entry flat_out must already hold a running per-lane maximum (in this
 * file LPF_MASK_HEV leaves max(|p1-p0|, |q1-q0|) there).  The macro folds
 * |p2-p0|, |q2-q0|, |p3-p0| and |q3-q0| into that maximum and then sets
 * each lane of flat_out to 0xff when the maximum is <= 1, 0x00 otherwise.
 *
 * NOTE: the result is finally ANDed with a variable named `mask` that is
 * NOT a macro parameter; it must exist in the invoking scope.
 */
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 zero_m = { 0 };                                              \
    v16u8 one_m, d2_m, d3_m;                                           \
                                                                       \
    one_m = __msa_ori_b(zero_m, 1);                                    \
                                                                       \
    /* largest distance from p0/q0 at offset 2 and at offset 3 */      \
    d2_m = __msa_max_u_b(__msa_asub_u_b(p2_in, p0_in),                 \
                         __msa_asub_u_b(q2_in, q0_in));                \
    d3_m = __msa_max_u_b(__msa_asub_u_b(p3_in, p0_in),                 \
                         __msa_asub_u_b(q3_in, q0_in));                \
                                                                       \
    flat_out = __msa_max_u_b(d2_m, flat_out);                          \
    flat_out = __msa_max_u_b(d3_m, flat_out);                          \
                                                                       \
    /* (1 < max) marks the non-flat lanes; invert to get the flat ones */ \
    flat_out = (one_m < (v16u8) flat_out);                             \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    flat_out = flat_out & (mask);                                      \
}
88
/*
 * Wide flatness test over p7..p4 / q4..q7 for sixteen byte lanes.
 *
 * Each lane of flat2_out becomes 0xff when
 *     max(|p4-p0|, |p5-p0|, |p6-p0|, |p7-p0|,
 *         |q4-q0|, |q5-q0|, |q6-q0|, |q7-q0|) <= 1
 * and 0x00 otherwise; the result is then ANDed with flat_in.
 * All arguments are v16u8.
 */
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 zero_m = { 0 };                                           \
    v16u8 one_m;                                                    \
    v16u8 d4_m, d5_m, d6_m, d7_m;                                   \
                                                                    \
    one_m = __msa_ori_b(zero_m, 1);                                 \
                                                                    \
    /* largest distance from p0/q0 at each of the offsets 4..7 */   \
    d4_m = __msa_max_u_b(__msa_asub_u_b(p4_in, p0_in),              \
                         __msa_asub_u_b(q4_in, q0_in));             \
    d5_m = __msa_max_u_b(__msa_asub_u_b(p5_in, p0_in),              \
                         __msa_asub_u_b(q5_in, q0_in));             \
    d6_m = __msa_max_u_b(__msa_asub_u_b(p6_in, p0_in),              \
                         __msa_asub_u_b(q6_in, q0_in));             \
    d7_m = __msa_max_u_b(__msa_asub_u_b(p7_in, p0_in),              \
                         __msa_asub_u_b(q7_in, q0_in));             \
                                                                    \
    flat2_out = __msa_max_u_b(d4_m, d5_m);                          \
    flat2_out = __msa_max_u_b(d6_m, flat2_out);                     \
    flat2_out = __msa_max_u_b(d7_m, flat2_out);                     \
                                                                    \
    /* (1 < max) marks the non-flat lanes; invert, then gate */     \
    flat2_out = (one_m < (v16u8) flat2_out);                        \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & flat_in;                                \
}
118
/*
 * 8-tap smoothing filter on eight 16-bit lanes.
 *
 * Inputs are v8u16 (pixels widened to 16 bits); outputs are v8i16.
 * Every output is a weighted sum with total weight 8, rounded and shifted
 * right by 3:
 *     p2' = 3*p3 + 2*p2 +   p1 +   p0 +   q0
 *     p1' = 2*p3 +   p2 + 2*p1 +   p0 +   q0 +   q1
 *     p0' =   p3 +   p2 +   p1 + 2*p0 +   q0 +   q1 +   q2
 *     q0' =   p2 +   p1 +   p0 + 2*q0 +   q1 +   q2 +   q3
 *     q1' =   p1 +   p0 +   q0 + 2*q1 +   q2 + 2*q3
 *     q2' =   p0 +   q0 +   q1 + 2*q2 + 3*q3
 */
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                          \
                    q0_in, q1_in, q2_in, q3_in,                          \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,            \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)            \
{                                                                        \
    v8u16 p_sum_m, acc_m, work_m;                                        \
                                                                         \
    /* acc = 2*p3 + p2 + p1 + p0 + q0 */                                 \
    p_sum_m = (p2_in) + (p1_in) + (p0_in);                               \
    acc_m   = (p3_in) << 1;                                              \
    acc_m   = acc_m + p_sum_m + (q0_in);                                 \
                                                                         \
    work_m       = acc_m + (p3_in) + (p2_in);                            \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) work_m, 3);             \
                                                                         \
    work_m       = acc_m + (p1_in) + (q1_in);                            \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) work_m, 3);             \
                                                                         \
    /* work = q2 + q1 + q0, p_sum = p2 + p1 + p0 + q0 + q1 + q2 */       \
    work_m  = (q2_in) + (q1_in) + (q0_in);                               \
    p_sum_m = p_sum_m + work_m;                                          \
    acc_m   = p_sum_m + (p0_in);                                         \
    acc_m   = acc_m + (p3_in);                                           \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) acc_m, 3);              \
                                                                         \
    acc_m  = (q2_in) + (q3_in);                                          \
    acc_m  = (p0_in) + work_m + acc_m;                                   \
    work_m = (q3_in) + (q3_in);                                          \
    work_m = work_m + acc_m;                                             \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) work_m, 3);             \
                                                                         \
    acc_m  = p_sum_m + (q3_in);                                          \
    work_m = acc_m + (q0_in);                                            \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) work_m, 3);             \
                                                                         \
    work_m = acc_m - (p2_in);                                            \
    acc_m  = (q1_in) + (q3_in);                                          \
    work_m = acc_m + work_m;                                             \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) work_m, 3);             \
}
157
/*
 * Compute the loop-filter decision vectors for sixteen byte lanes.
 *
 * Inputs p3_in..q3_in, limit_in, b_limit_in and thresh_in are v16u8.
 *
 * flat_out : max(|p1-p0|, |q1-q0|)  (a raw maximum, not yet a mask).
 * hev_out  : all ones in lanes where thresh_in < flat_out.
 * mask_out : all ones in lanes where
 *                sat(sat(2*|p0-q0|) + |p1-q1|/2) <= b_limit_in
 *            and max of every neighbouring difference
 *                (|p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1|, |q3-q2|)
 *            is <= limit_in; all zeros otherwise.
 */
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                    \
                     q0_in, q1_in, q2_in, q3_in,                    \
                     limit_in, b_limit_in, thresh_in,               \
                     hev_out, mask_out, flat_out)                   \
{                                                                   \
    v16u8 d_p3p2_m, d_p2p1_m, d_p1p0_m;                             \
    v16u8 d_q1q0_m, d_q2q1_m, d_q3q2_m;                             \
    v16u8 d_p0q0_m, d_p1q1_m;                                       \
                                                                    \
    /* per-lane absolute differences */                             \
    d_p3p2_m = __msa_asub_u_b(p3_in, p2_in);                        \
    d_p2p1_m = __msa_asub_u_b(p2_in, p1_in);                        \
    d_p1p0_m = __msa_asub_u_b(p1_in, p0_in);                        \
    d_q1q0_m = __msa_asub_u_b(q1_in, q0_in);                        \
    d_q2q1_m = __msa_asub_u_b(q2_in, q1_in);                        \
    d_q3q2_m = __msa_asub_u_b(q3_in, q2_in);                        \
    d_p0q0_m = __msa_asub_u_b(p0_in, q0_in);                        \
    d_p1q1_m = __msa_asub_u_b(p1_in, q1_in);                        \
                                                                    \
    /* hev: the innermost step on either side exceeds thresh */     \
    flat_out = __msa_max_u_b(d_p1p0_m, d_q1q0_m);                   \
    hev_out  = thresh_in < (v16u8) flat_out;                        \
                                                                    \
    /* edge term: 2 * |p0 - q0| + |p1 - q1| / 2, saturating */      \
    d_p0q0_m = __msa_adds_u_b(d_p0q0_m, d_p0q0_m);                  \
    d_p1q1_m = d_p1q1_m >> 1;                                       \
    d_p0q0_m = __msa_adds_u_b(d_p0q0_m, d_p1q1_m);                  \
                                                                    \
    /* a lane failing the b_limit test is forced to 0xff, the */    \
    /* other lanes accumulate the largest neighbouring difference */\
    mask_out = b_limit_in < d_p0q0_m;                               \
    mask_out = __msa_max_u_b(flat_out, mask_out);                   \
    d_p3p2_m = __msa_max_u_b(d_p3p2_m, d_p2p1_m);                   \
    mask_out = __msa_max_u_b(d_p3p2_m, mask_out);                   \
    d_q2q1_m = __msa_max_u_b(d_q2q1_m, d_q3q2_m);                   \
    mask_out = __msa_max_u_b(d_q2q1_m, mask_out);                   \
                                                                    \
    /* (limit < max) marks rejected lanes; invert to get the mask */\
    mask_out = limit_in < (v16u8) mask_out;                         \
    mask_out = __msa_xori_b(mask_out, 0xff);                        \
}
195
ff_loop_filter_v_4_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)196 void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
197 int32_t b_limit_ptr,
198 int32_t limit_ptr,
199 int32_t thresh_ptr)
200 {
201 uint64_t p1_d, p0_d, q0_d, q1_d;
202 v16u8 mask, hev, flat, thresh, b_limit, limit;
203 v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
204
205 /* load vector elements */
206 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
207
208 thresh = (v16u8) __msa_fill_b(thresh_ptr);
209 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210 limit = (v16u8) __msa_fill_b(limit_ptr);
211
212 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
213 hev, mask, flat);
214 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
215 q1_out);
216
217 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
222 }
223
224
ff_loop_filter_v_44_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)225 void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
226 int32_t b_limit_ptr,
227 int32_t limit_ptr,
228 int32_t thresh_ptr)
229 {
230 v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
232
233 /* load vector elements */
234 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
235
236 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
239
240 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
243
244 limit0 = (v16u8) __msa_fill_b(limit_ptr);
245 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
247
248 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
249 hev, mask, flat);
250 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
251
252 ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
253 }
254
ff_loop_filter_v_8_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)255 void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
256 int32_t b_limit_ptr,
257 int32_t limit_ptr,
258 int32_t thresh_ptr)
259 {
260 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
261 v16u8 mask, hev, flat, thresh, b_limit, limit;
262 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
263 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264 v8i16 p2_filter8, p1_filter8, p0_filter8;
265 v8i16 q0_filter8, q1_filter8, q2_filter8;
266 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
267 v16i8 zero = { 0 };
268
269 /* load vector elements */
270 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
271
272 thresh = (v16u8) __msa_fill_b(thresh_ptr);
273 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274 limit = (v16u8) __msa_fill_b(limit_ptr);
275
276 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
277 hev, mask, flat);
278 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
279 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
280 q1_out);
281
282 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
283
284 /* if flat is zero for all pixels, then no need to calculate other filter */
285 if (__msa_test_bz_v(flat)) {
286 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
291 } else {
292 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
294 q2_r, q3_r);
295 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
297
298 /* convert 16 bit output data into 8 bit */
299 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
301 q0_filter8);
302 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
303
304 /* store pixel values */
305 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
311
312 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
318
319 src -= 3 * pitch;
320
321 SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
322 src += (4 * pitch);
323 SD(q1_d, src);
324 src += pitch;
325 SD(q2_d, src);
326 }
327 }
328
ff_loop_filter_v_88_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)329 void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
330 int32_t b_limit_ptr,
331 int32_t limit_ptr,
332 int32_t thresh_ptr)
333 {
334 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
335 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
336 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
337 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
343 v16u8 zero = { 0 };
344
345 /* load vector elements */
346 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
347
348 thresh = (v16u8) __msa_fill_b(thresh_ptr);
349 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
351
352 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
355
356 limit = (v16u8) __msa_fill_b(limit_ptr);
357 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
359
360 /* mask and hev */
361 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
362 hev, mask, flat);
363 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
364 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
365 q1_out);
366
367 /* if flat is zero for all pixels, then no need to calculate other filter */
368 if (__msa_test_bz_v(flat)) {
369 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
370 } else {
371 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
373 q2_r, q3_r);
374 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
376
377 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
378 p0_l);
379 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
380 q3_l);
381 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
383
384 /* convert 16 bit output data into 8 bit */
385 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387 p0_filt8_r, q0_filt8_r);
388 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389 q1_filt8_r, q2_filt8_r);
390
391 /* store pixel values */
392 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
398
399 src -= 3 * pitch;
400
401 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
402 src += (4 * pitch);
403 ST_UB2(q1_out, q2_out, src, pitch);
404 src += (2 * pitch);
405 }
406 }
407
ff_loop_filter_v_84_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)408 void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
409 int32_t b_limit_ptr,
410 int32_t limit_ptr,
411 int32_t thresh_ptr)
412 {
413 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
414 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
415 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
416 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
419 v16u8 zero = { 0 };
420
421 /* load vector elements */
422 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
423
424 thresh = (v16u8) __msa_fill_b(thresh_ptr);
425 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
427
428 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
431
432 limit = (v16u8) __msa_fill_b(limit_ptr);
433 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
435
436 /* mask and hev */
437 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
438 hev, mask, flat);
439 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
440 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
441 q1_out);
442
443 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
444
445 /* if flat is zero for all pixels, then no need to calculate other filter */
446 if (__msa_test_bz_v(flat)) {
447 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
448 } else {
449 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
451 q2_r, q3_r);
452 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
454
455 /* convert 16 bit output data into 8 bit */
456 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460 q1_filt8_r, q2_filt8_r);
461
462 /* store pixel values */
463 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
469
470 src -= 3 * pitch;
471
472 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
473 src += (4 * pitch);
474 ST_UB2(q1_out, q2_out, src, pitch);
475 src += (2 * pitch);
476 }
477 }
478
ff_loop_filter_v_48_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)479 void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
480 int32_t b_limit_ptr,
481 int32_t limit_ptr,
482 int32_t thresh_ptr)
483 {
484 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
485 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
486 v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
487 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
490 v16u8 zero = { 0 };
491
492 /* load vector elements */
493 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
494
495 thresh = (v16u8) __msa_fill_b(thresh_ptr);
496 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497 thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
498
499 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501 b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
502
503 limit = (v16u8) __msa_fill_b(limit_ptr);
504 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505 limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
506
507 /* mask and hev */
508 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
509 hev, mask, flat);
510 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
511 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
512 q1_out);
513
514 flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
515
516 /* if flat is zero for all pixels, then no need to calculate other filter */
517 if (__msa_test_bz_v(flat)) {
518 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
519 } else {
520 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
521 p0_l);
522 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
523 q3_l);
524 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
526
527 /* convert 16 bit output data into 8 bit */
528 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532 q1_filt8_l, q2_filt8_l);
533
534 /* store pixel values */
535 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
541
542 src -= 3 * pitch;
543
544 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
545 src += (4 * pitch);
546 ST_UB2(q1_out, q2_out, src, pitch);
547 src += (2 * pitch);
548 }
549 }
550
vp9_hz_lpf_t4_and_t8_16w(uint8_t * src,ptrdiff_t pitch,uint8_t * filter48,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)551 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
552 uint8_t *filter48,
553 int32_t b_limit_ptr,
554 int32_t limit_ptr,
555 int32_t thresh_ptr)
556 {
557 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
558 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
559 v16u8 flat, mask, hev, thresh, b_limit, limit;
560 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
566 v16u8 zero = { 0 };
567
568 /* load vector elements */
569 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
570
571 thresh = (v16u8) __msa_fill_b(thresh_ptr);
572 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573 limit = (v16u8) __msa_fill_b(limit_ptr);
574
575 /* mask and hev */
576 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
577 hev, mask, flat);
578 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
579 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
580 q1_out);
581
582 /* if flat is zero for all pixels, then no need to calculate other filter */
583 if (__msa_test_bz_v(flat)) {
584 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585
586 return 1;
587 } else {
588 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
590 q2_r, q3_r);
591 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
593
594 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
595 p0_l);
596 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
597 q3_l);
598 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
600
601 /* convert 16 bit output data into 8 bit */
602 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604 p0_filt8_r, q0_filt8_r);
605 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
606 q2_filt8_r);
607
608 /* store pixel values */
609 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
615
616 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617 filter48 += (4 * 16);
618 ST_UB2(q1_out, q2_out, filter48, 16);
619 filter48 += (2 * 16);
620 ST_UB(flat, filter48);
621
622 return 0;
623 }
624 }
625
vp9_hz_lpf_t16_16w(uint8_t * src,ptrdiff_t pitch,uint8_t * filter48)626 static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
627 {
628 v16u8 flat, flat2, filter8;
629 v16i8 zero = { 0 };
630 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
631 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
640 v8i16 l_out, r_out;
641
642 flat = LD_UB(filter48 + 96);
643
644 LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645 LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
647
648 /* if flat2 is zero for all pixels, then no need to calculate other filter */
649 if (__msa_test_bz_v(flat2)) {
650 LD_UB4(filter48, 16, p2, p1, p0, q0);
651 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
652
653 src -= 3 * pitch;
654 ST_UB4(p2, p1, p0, q0, src, pitch);
655 src += (4 * pitch);
656 ST_UB2(q1, q2, src, pitch);
657 } else {
658 src -= 7 * pitch;
659
660 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
663
664 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
665
666 tmp0_r = p7_r_in << 3;
667 tmp0_r -= p7_r_in;
668 tmp0_r += p6_r_in;
669 tmp0_r += q0_r_in;
670 tmp1_r = p6_r_in + p5_r_in;
671 tmp1_r += p4_r_in;
672 tmp1_r += p3_r_in;
673 tmp1_r += p2_r_in;
674 tmp1_r += p1_r_in;
675 tmp1_r += p0_r_in;
676 tmp1_r += tmp0_r;
677 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
678
679 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
680 p5_l_in, p4_l_in);
681 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
682 p1_l_in, p0_l_in);
683 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
684
685 tmp0_l = p7_l_in << 3;
686 tmp0_l -= p7_l_in;
687 tmp0_l += p6_l_in;
688 tmp0_l += q0_l_in;
689 tmp1_l = p6_l_in + p5_l_in;
690 tmp1_l += p4_l_in;
691 tmp1_l += p3_l_in;
692 tmp1_l += p2_l_in;
693 tmp1_l += p1_l_in;
694 tmp1_l += p0_l_in;
695 tmp1_l += tmp0_l;
696 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
697
698 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
700 ST_UB(p6, src);
701 src += pitch;
702
703 /* p5 */
704 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
705 tmp0_r = p5_r_in - p6_r_in;
706 tmp0_r += q1_r_in;
707 tmp0_r -= p7_r_in;
708 tmp1_r += tmp0_r;
709 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
710
711 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
712 tmp0_l = p5_l_in - p6_l_in;
713 tmp0_l += q1_l_in;
714 tmp0_l -= p7_l_in;
715 tmp1_l += tmp0_l;
716 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
717
718 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
720 ST_UB(p5, src);
721 src += pitch;
722
723 /* p4 */
724 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725 tmp0_r = p4_r_in - p5_r_in;
726 tmp0_r += q2_r_in;
727 tmp0_r -= p7_r_in;
728 tmp1_r += tmp0_r;
729 r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
730
731 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732 tmp0_l = p4_l_in - p5_l_in;
733 tmp0_l += q2_l_in;
734 tmp0_l -= p7_l_in;
735 tmp1_l += tmp0_l;
736 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
737
738 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
740 ST_UB(p4, src);
741 src += pitch;
742
743 /* p3 */
744 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745 tmp0_r = p3_r_in - p4_r_in;
746 tmp0_r += q3_r_in;
747 tmp0_r -= p7_r_in;
748 tmp1_r += tmp0_r;
749 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
750
751 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752 tmp0_l = p3_l_in - p4_l_in;
753 tmp0_l += q3_l_in;
754 tmp0_l -= p7_l_in;
755 tmp1_l += tmp0_l;
756 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
757
758 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
760 ST_UB(p3, src);
761 src += pitch;
762
763 /* p2 */
764 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765 filter8 = LD_UB(filter48);
766 tmp0_r = p2_r_in - p3_r_in;
767 tmp0_r += q4_r_in;
768 tmp0_r -= p7_r_in;
769 tmp1_r += tmp0_r;
770 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
771
772 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773 tmp0_l = p2_l_in - p3_l_in;
774 tmp0_l += q4_l_in;
775 tmp0_l -= p7_l_in;
776 tmp1_l += tmp0_l;
777 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
778
779 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
781 ST_UB(filter8, src);
782 src += pitch;
783
784 /* p1 */
785 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786 filter8 = LD_UB(filter48 + 16);
787 tmp0_r = p1_r_in - p2_r_in;
788 tmp0_r += q5_r_in;
789 tmp0_r -= p7_r_in;
790 tmp1_r += tmp0_r;
791 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
792
793 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794 tmp0_l = p1_l_in - p2_l_in;
795 tmp0_l += q5_l_in;
796 tmp0_l -= p7_l_in;
797 tmp1_l += tmp0_l;
798 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
799
800 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
802 ST_UB(filter8, src);
803 src += pitch;
804
805 /* p0 */
806 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807 filter8 = LD_UB(filter48 + 32);
808 tmp0_r = p0_r_in - p1_r_in;
809 tmp0_r += q6_r_in;
810 tmp0_r -= p7_r_in;
811 tmp1_r += tmp0_r;
812 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
813
814 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815 tmp0_l = p0_l_in - p1_l_in;
816 tmp0_l += q6_l_in;
817 tmp0_l -= p7_l_in;
818 tmp1_l += tmp0_l;
819 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
820
821 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
823 ST_UB(filter8, src);
824 src += pitch;
825
826 /* q0 */
827 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828 filter8 = LD_UB(filter48 + 48);
829 tmp0_r = q7_r_in - p0_r_in;
830 tmp0_r += q0_r_in;
831 tmp0_r -= p7_r_in;
832 tmp1_r += tmp0_r;
833 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
834
835 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836 tmp0_l = q7_l_in - p0_l_in;
837 tmp0_l += q0_l_in;
838 tmp0_l -= p7_l_in;
839 tmp1_l += tmp0_l;
840 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
841
842 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
844 ST_UB(filter8, src);
845 src += pitch;
846
847 /* q1 */
848 filter8 = LD_UB(filter48 + 64);
849 tmp0_r = q7_r_in - q0_r_in;
850 tmp0_r += q1_r_in;
851 tmp0_r -= p6_r_in;
852 tmp1_r += tmp0_r;
853 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
854
855 tmp0_l = q7_l_in - q0_l_in;
856 tmp0_l += q1_l_in;
857 tmp0_l -= p6_l_in;
858 tmp1_l += tmp0_l;
859 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
860
861 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
863 ST_UB(filter8, src);
864 src += pitch;
865
866 /* q2 */
867 filter8 = LD_UB(filter48 + 80);
868 tmp0_r = q7_r_in - q1_r_in;
869 tmp0_r += q2_r_in;
870 tmp0_r -= p5_r_in;
871 tmp1_r += tmp0_r;
872 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
873
874 tmp0_l = q7_l_in - q1_l_in;
875 tmp0_l += q2_l_in;
876 tmp0_l -= p5_l_in;
877 tmp1_l += tmp0_l;
878 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
879
880 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
882 ST_UB(filter8, src);
883 src += pitch;
884
885 /* q3 */
886 tmp0_r = q7_r_in - q2_r_in;
887 tmp0_r += q3_r_in;
888 tmp0_r -= p4_r_in;
889 tmp1_r += tmp0_r;
890 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
891
892 tmp0_l = q7_l_in - q2_l_in;
893 tmp0_l += q3_l_in;
894 tmp0_l -= p4_l_in;
895 tmp1_l += tmp0_l;
896 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
897
898 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
900 ST_UB(q3, src);
901 src += pitch;
902
903 /* q4 */
904 tmp0_r = q7_r_in - q3_r_in;
905 tmp0_r += q4_r_in;
906 tmp0_r -= p3_r_in;
907 tmp1_r += tmp0_r;
908 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
909
910 tmp0_l = q7_l_in - q3_l_in;
911 tmp0_l += q4_l_in;
912 tmp0_l -= p3_l_in;
913 tmp1_l += tmp0_l;
914 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
915
916 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
918 ST_UB(q4, src);
919 src += pitch;
920
921 /* q5 */
922 tmp0_r = q7_r_in - q4_r_in;
923 tmp0_r += q5_r_in;
924 tmp0_r -= p2_r_in;
925 tmp1_r += tmp0_r;
926 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
927
928 tmp0_l = q7_l_in - q4_l_in;
929 tmp0_l += q5_l_in;
930 tmp0_l -= p2_l_in;
931 tmp1_l += tmp0_l;
932 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
933
934 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
936 ST_UB(q5, src);
937 src += pitch;
938
939 /* q6 */
940 tmp0_r = q7_r_in - q5_r_in;
941 tmp0_r += q6_r_in;
942 tmp0_r -= p1_r_in;
943 tmp1_r += tmp0_r;
944 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
945
946 tmp0_l = q7_l_in - q5_l_in;
947 tmp0_l += q6_l_in;
948 tmp0_l -= p1_l_in;
949 tmp1_l += tmp0_l;
950 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
951
952 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
954 ST_UB(q6, src);
955 }
956 }
957
ff_loop_filter_v_16_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)958 void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
959 int32_t b_limit_ptr,
960 int32_t limit_ptr,
961 int32_t thresh_ptr)
962 {
963 uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
964 uint8_t early_exit = 0;
965
966 early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
967 b_limit_ptr, limit_ptr, thresh_ptr);
968
969 if (0 == early_exit) {
970 vp9_hz_lpf_t16_16w(src, pitch, filter48);
971 }
972 }
973
ff_loop_filter_v_16_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)974 void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
975 int32_t b_limit_ptr,
976 int32_t limit_ptr,
977 int32_t thresh_ptr)
978 {
979 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980 uint64_t dword0, dword1;
981 v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
982 v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
983 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984 v16u8 p0_filter16, p1_filter16;
985 v8i16 p2_filter8, p1_filter8, p0_filter8;
986 v8i16 q0_filter8, q1_filter8, q2_filter8;
987 v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
989 v16i8 zero = { 0 };
990 v8u16 tmp0, tmp1, tmp2;
991
992 /* load vector elements */
993 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
994
995 thresh = (v16u8) __msa_fill_b(thresh_ptr);
996 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997 limit = (v16u8) __msa_fill_b(limit_ptr);
998
999 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1000 hev, mask, flat);
1001 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1002 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1003 q1_out);
1004
1005 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1006
1007 /* if flat is zero for all pixels, then no need to calculate other filter */
1008 if (__msa_test_bz_v(flat)) {
1009 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013 SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1014 } else {
1015 /* convert 8 bit input data into 16 bit */
1016 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017 q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1018 q1_r, q2_r, q3_r);
1019 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021 q1_filter8, q2_filter8);
1022
1023 /* convert 16 bit output data into 8 bit */
1024 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1026 q0_filter8);
1027 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1028 q2_filter8);
1029
1030 /* store pixel values */
1031 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1037
1038 /* load 16 vector elements */
1039 LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040 LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1041
1042 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1043
1044 /* if flat2 is zero for all pixels, then no need to calculate other filter */
1045 if (__msa_test_bz_v(flat2)) {
1046 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1052
1053 SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054 SD(q1_d, src + pitch);
1055 SD(q2_d, src + 2 * pitch);
1056 } else {
1057 /* LSB(right) 8 pixel operation */
1058 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059 zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060 q4_r, q5_r, q6_r, q7_r);
1061
1062 tmp0 = p7_r << 3;
1063 tmp0 -= p7_r;
1064 tmp0 += p6_r;
1065 tmp0 += q0_r;
1066
1067 src -= 7 * pitch;
1068
1069 /* calculation of p6 and p5 */
1070 tmp1 = p6_r + p5_r + p4_r + p3_r;
1071 tmp1 += (p2_r + p1_r + p0_r);
1072 tmp1 += tmp0;
1073 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074 tmp0 = p5_r - p6_r + q1_r - p7_r;
1075 tmp1 += tmp0;
1076 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1077 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1078 p0_filter16, p1_filter16);
1079 p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080 p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1083 SD(dword0, src);
1084 src += pitch;
1085 SD(dword1, src);
1086 src += pitch;
1087
1088 /* calculation of p4 and p3 */
1089 tmp0 = p4_r - p5_r + q2_r - p7_r;
1090 tmp2 = p3_r - p4_r + q3_r - p7_r;
1091 tmp1 += tmp0;
1092 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1093 tmp1 += tmp2;
1094 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1095 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1096 p0_filter16, p1_filter16);
1097 p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098 p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1101 SD(dword0, src);
1102 src += pitch;
1103 SD(dword1, src);
1104 src += pitch;
1105
1106 /* calculation of p2 and p1 */
1107 tmp0 = p2_r - p3_r + q4_r - p7_r;
1108 tmp2 = p1_r - p2_r + q5_r - p7_r;
1109 tmp1 += tmp0;
1110 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1111 tmp1 += tmp2;
1112 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1113 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1114 p0_filter16, p1_filter16);
1115 p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116 p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1119 SD(dword0, src);
1120 src += pitch;
1121 SD(dword1, src);
1122 src += pitch;
1123
1124 /* calculation of p0 and q0 */
1125 tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126 tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1127 tmp1 += tmp0;
1128 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1129 tmp1 += tmp2;
1130 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1131 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1132 p0_filter16, p1_filter16);
1133 p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134 p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1137 SD(dword0, src);
1138 src += pitch;
1139 SD(dword1, src);
1140 src += pitch;
1141
1142 /* calculation of q1 and q2 */
1143 tmp0 = q7_r - q0_r + q1_r - p6_r;
1144 tmp2 = q7_r - q1_r + q2_r - p5_r;
1145 tmp1 += tmp0;
1146 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1147 tmp1 += tmp2;
1148 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1149 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1150 p0_filter16, p1_filter16);
1151 p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152 p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1155 SD(dword0, src);
1156 src += pitch;
1157 SD(dword1, src);
1158 src += pitch;
1159
1160 /* calculation of q3 and q4 */
1161 tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162 tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1163 tmp1 += tmp0;
1164 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1165 tmp1 += tmp2;
1166 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1167 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1168 p0_filter16, p1_filter16);
1169 p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170 p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1173 SD(dword0, src);
1174 src += pitch;
1175 SD(dword1, src);
1176 src += pitch;
1177
1178 /* calculation of q5 and q6 */
1179 tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180 tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1181 tmp1 += tmp0;
1182 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1183 tmp1 += tmp2;
1184 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1185 PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1186 p0_filter16, p1_filter16);
1187 p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188 p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1191 SD(dword0, src);
1192 src += pitch;
1193 SD(dword1, src);
1194 }
1195 }
1196 }
1197
ff_loop_filter_h_4_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1198 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199 int32_t b_limit_ptr,
1200 int32_t limit_ptr,
1201 int32_t thresh_ptr)
1202 {
1203 v16u8 mask, hev, flat, limit, thresh, b_limit;
1204 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205 v8i16 vec0, vec1, vec2, vec3;
1206
1207 LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208
1209 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211 limit = (v16u8) __msa_fill_b(limit_ptr);
1212
1213 TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214 p3, p2, p1, p0, q0, q1, q2, q3);
1215 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216 hev, mask, flat);
1217 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1218 ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220
1221 src -= 2;
1222 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223 }
1224
ff_loop_filter_h_44_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1225 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226 int32_t b_limit_ptr,
1227 int32_t limit_ptr,
1228 int32_t thresh_ptr)
1229 {
1230 v16u8 mask, hev, flat;
1231 v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236
1237 LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238 LD_UB8(src - 4 + (8 * pitch), pitch,
1239 row8, row9, row10, row11, row12, row13, row14, row15);
1240
1241 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242 row8, row9, row10, row11, row12, row13, row14, row15,
1243 p3, p2, p1, p0, q0, q1, q2, q3);
1244
1245 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248
1249 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252
1253 limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256
1257 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258 hev, mask, flat);
1259 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260 ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261 ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262 ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263 ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264
1265 src -= 2;
1266
1267 ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268 ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269 }
1270
ff_loop_filter_h_8_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1271 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272 int32_t b_limit_ptr,
1273 int32_t limit_ptr,
1274 int32_t thresh_ptr)
1275 {
1276 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277 v16u8 p1_out, p0_out, q0_out, q1_out;
1278 v16u8 flat, mask, hev, thresh, b_limit, limit;
1279 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282 v16u8 zero = { 0 };
1283 v8i16 vec0, vec1, vec2, vec3, vec4;
1284
1285 /* load vector elements */
1286 LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287
1288 TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289 p3, p2, p1, p0, q0, q1, q2, q3);
1290
1291 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293 limit = (v16u8) __msa_fill_b(limit_ptr);
1294
1295 /* mask and hev */
1296 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297 hev, mask, flat);
1298 /* flat4 */
1299 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300 /* filter4 */
1301 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302 q1_out);
1303
1304 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305
1306 /* if flat is zero for all pixels, then no need to calculate other filter */
1307 if (__msa_test_bz_v(flat)) {
1308 /* Store 4 pixels p1-_q1 */
1309 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311
1312 src -= 2;
1313 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314 } else {
1315 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317 q3_r);
1318 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320 /* convert 16 bit output data into 8 bit */
1321 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322 p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323 p0_filt8_r, q0_filt8_r);
1324 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325 q2_filt8_r);
1326
1327 /* store pixel values */
1328 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334
1335 /* Store 6 pixels p2-_q2 */
1336 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338 vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339
1340 src -= 3;
1341 ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342 ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343 src += (4 * pitch);
1344 ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345 ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346 }
1347 }
1348
ff_loop_filter_h_88_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1349 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350 int32_t b_limit_ptr,
1351 int32_t limit_ptr,
1352 int32_t thresh_ptr)
1353 {
1354 uint8_t *temp_src;
1355 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356 v16u8 p1_out, p0_out, q0_out, q1_out;
1357 v16u8 flat, mask, hev, thresh, b_limit, limit;
1358 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365 v16u8 zero = { 0 };
1366 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367
1368 temp_src = src - 4;
1369
1370 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371 temp_src += (8 * pitch);
1372 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373
1374 /* transpose 16x8 matrix into 8x16 */
1375 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376 q3, q2, q1, q0, row12, row13, row14, row15,
1377 p3, p2, p1, p0, q0, q1, q2, q3);
1378
1379 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382
1383 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386
1387 limit = (v16u8) __msa_fill_b(limit_ptr);
1388 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390
1391 /* mask and hev */
1392 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393 hev, mask, flat);
1394 /* flat4 */
1395 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396 /* filter4 */
1397 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398 q1_out);
1399
1400 /* if flat is zero for all pixels, then no need to calculate other filter */
1401 if (__msa_test_bz_v(flat)) {
1402 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406
1407 src -= 2;
1408 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409 ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410 } else {
1411 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413 q3_r);
1414 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416
1417 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418 p0_l);
1419 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420 q3_l);
1421
1422 /* filter8 */
1423 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425
1426 /* convert 16 bit output data into 8 bit */
1427 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429 p0_filt8_r, q0_filt8_r);
1430 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431 q2_filt8_r);
1432
1433 /* store pixel values */
1434 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440
1441 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445 ILVRL_B2_SH(q2, q1, vec2, vec5);
1446
1447 src -= 3;
1448 ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449 ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450 src += (4 * pitch);
1451 ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452 ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453 src += (4 * pitch);
1454 ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455 ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456 src += (4 * pitch);
1457 ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458 ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459 }
1460 }
1461
ff_loop_filter_h_84_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1462 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463 int32_t b_limit_ptr,
1464 int32_t limit_ptr,
1465 int32_t thresh_ptr)
1466 {
1467 uint8_t *temp_src;
1468 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469 v16u8 p1_out, p0_out, q0_out, q1_out;
1470 v16u8 flat, mask, hev, thresh, b_limit, limit;
1471 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475 v16u8 zero = { 0 };
1476 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477
1478 temp_src = src - 4;
1479
1480 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481 temp_src += (8 * pitch);
1482 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483
1484 /* transpose 16x8 matrix into 8x16 */
1485 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486 q3, q2, q1, q0, row12, row13, row14, row15,
1487 p3, p2, p1, p0, q0, q1, q2, q3);
1488
1489 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492
1493 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496
1497 limit = (v16u8) __msa_fill_b(limit_ptr);
1498 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500
1501 /* mask and hev */
1502 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503 hev, mask, flat);
1504 /* flat4 */
1505 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506 /* filter4 */
1507 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508 q1_out);
1509
1510 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511
1512 /* if flat is zero for all pixels, then no need to calculate other filter */
1513 if (__msa_test_bz_v(flat)) {
1514 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518
1519 src -= 2;
1520 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521 ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522 } else {
1523 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525 q3_r);
1526 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528
1529 /* convert 16 bit output data into 8 bit */
1530 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534 q1_filt8_r, q2_filt8_r);
1535
1536 /* store pixel values */
1537 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543
1544 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548 ILVRL_B2_SH(q2, q1, vec2, vec5);
1549
1550 src -= 3;
1551 ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552 ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553 src += (4 * pitch);
1554 ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555 ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556 src += (4 * pitch);
1557 ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558 ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559 src += (4 * pitch);
1560 ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561 ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562 }
1563 }
1564
ff_loop_filter_h_48_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1565 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566 int32_t b_limit_ptr,
1567 int32_t limit_ptr,
1568 int32_t thresh_ptr)
1569 {
1570 uint8_t *temp_src;
1571 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572 v16u8 p1_out, p0_out, q0_out, q1_out;
1573 v16u8 flat, mask, hev, thresh, b_limit, limit;
1574 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578 v16u8 zero = { 0 };
1579 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580
1581 temp_src = src - 4;
1582
1583 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584 temp_src += (8 * pitch);
1585 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586
1587 /* transpose 16x8 matrix into 8x16 */
1588 TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589 q3, q2, q1, q0, row12, row13, row14, row15,
1590 p3, p2, p1, p0, q0, q1, q2, q3);
1591
1592 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595
1596 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599
1600 limit = (v16u8) __msa_fill_b(limit_ptr);
1601 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603
1604 /* mask and hev */
1605 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606 hev, mask, flat);
1607 /* flat4 */
1608 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609 /* filter4 */
1610 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611 q1_out);
1612
1613 flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614
1615 /* if flat is zero for all pixels, then no need to calculate other filter */
1616 if (__msa_test_bz_v(flat)) {
1617 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621
1622 src -= 2;
1623 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624 ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625 } else {
1626 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627 p0_l);
1628 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629 q3_l);
1630
1631 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633
1634 /* convert 16 bit output data into 8 bit */
1635 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639 q1_filt8_l, q2_filt8_l);
1640
1641 /* store pixel values */
1642 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648
1649 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653 ILVRL_B2_SH(q2, q1, vec2, vec5);
1654
1655 src -= 3;
1656 ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657 ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658 src += (4 * pitch);
1659 ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660 ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661 src += (4 * pitch);
1662 ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663 ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664 src += (4 * pitch);
1665 ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666 ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667 }
1668 }
1669
vp9_transpose_16x8_to_8x16(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)1670 static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1671 uint8_t *output, int32_t out_pitch)
1672 {
1673 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676 v16i8 zeros = { 0 };
1677
1678 LD_UB8(input, in_pitch,
1679 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1680 /* 8x8 transpose */
1681 TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1682 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1683 /* 8x8 transpose */
1684 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1685 tmp0, tmp1, tmp2, tmp3);
1686 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1687 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1688 ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1689 ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1690 SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
1691
1692 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1693 output += (8 * out_pitch);
1694 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1695 }
1696
vp9_transpose_8x16_to_16x8(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)1697 static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1698 uint8_t *output, int32_t out_pitch)
1699 {
1700 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1701 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1702
1703 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1704 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1705 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1706 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1707 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1708 }
1709
vp9_transpose_16x16(uint8_t * input,int32_t in_pitch,uint8_t * output,int32_t out_pitch)1710 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1711 uint8_t *output, int32_t out_pitch)
1712 {
1713 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1714 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1715 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1716 v4i32 tmp2, tmp3;
1717 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1718
1719 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1720 input += (8 * in_pitch);
1721 LD_UB8(input, in_pitch,
1722 row8, row9, row10, row11, row12, row13, row14, row15);
1723
1724 TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1725 row8, row9, row10, row11, row12, row13, row14, row15,
1726 p7, p6, p5, p4, p3, p2, p1, p0);
1727
1728 /* transpose 16x8 matrix into 8x16 */
1729 /* total 8 intermediate register and 32 instructions */
1730 q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1731 q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1732 q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1733 q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1734 q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1735 q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1736 q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1737 q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1738
1739 ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1740 tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1741 tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1742
1743 ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1744 tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1745 tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1746
1747 ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1748 q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1749 q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1750
1751 tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1752 tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1753 q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1754 q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1755
1756 ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1757 q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1758 q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1759
1760 tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1761 tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1762 q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1763 q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764
1765 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766 output += (8 * out_pitch);
1767 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768 }
1769
vp9_vt_lpf_t4_and_t8_8w(uint8_t * src,uint8_t * filter48,uint8_t * src_org,int32_t pitch_org,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)1770 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1771 uint8_t *src_org, int32_t pitch_org,
1772 int32_t b_limit_ptr,
1773 int32_t limit_ptr,
1774 int32_t thresh_ptr)
1775 {
1776 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1777 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1778 v16u8 flat, mask, hev, thresh, b_limit, limit;
1779 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1780 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1781 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1782 v16i8 zero = { 0 };
1783 v8i16 vec0, vec1, vec2, vec3;
1784
1785 /* load vector elements */
1786 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1787
1788 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1789 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1790 limit = (v16u8) __msa_fill_b(limit_ptr);
1791
1792 /* mask and hev */
1793 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1794 hev, mask, flat);
1795 /* flat4 */
1796 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1797 /* filter4 */
1798 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1799 q1_out);
1800
1801 flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1802
1803 /* if flat is zero for all pixels, then no need to calculate other filter */
1804 if (__msa_test_bz_v(flat)) {
1805 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1806 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1807 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1808 return 1;
1809 } else {
1810 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1811 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1812 q3_r);
1813 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1814 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1815
1816 /* convert 16 bit output data into 8 bit */
1817 p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1818 p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1819 p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1820 q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1821 q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1822 q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1823
1824 /* store pixel values */
1825 p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1826 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1827 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1828 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1829 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1830 q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1831
1832 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1833 filter48 += (4 * 16);
1834 ST_UB2(q1_out, q2_out, filter48, 16);
1835 filter48 += (2 * 16);
1836 ST_UB(flat, filter48);
1837
1838 return 0;
1839 }
1840 }
1841
vp9_vt_lpf_t16_8w(uint8_t * src,uint8_t * src_org,ptrdiff_t pitch,uint8_t * filter48)1842 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1843 uint8_t *filter48)
1844 {
1845 v16i8 zero = { 0 };
1846 v16u8 filter8, flat, flat2;
1847 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1848 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1849 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1850 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1851 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1852 v8u16 tmp0_r, tmp1_r;
1853 v8i16 r_out;
1854
1855 flat = LD_UB(filter48 + 6 * 16);
1856
1857 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1858 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1859
1860 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1861
1862 /* if flat2 is zero for all pixels, then no need to calculate other filter */
1863 if (__msa_test_bz_v(flat2)) {
1864 v8i16 vec0, vec1, vec2, vec3, vec4;
1865
1866 LD_UB4(filter48, 16, p2, p1, p0, q0);
1867 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1868
1869 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1870 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1871 vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1872
1873 src_org -= 3;
1874 ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1875 ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1876 src_org += (4 * pitch);
1877 ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1878 ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1879
1880 return 1;
1881 } else {
1882 src -= 7 * 16;
1883
1884 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1885 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1886 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1887 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1888
1889 tmp0_r = p7_r_in << 3;
1890 tmp0_r -= p7_r_in;
1891 tmp0_r += p6_r_in;
1892 tmp0_r += q0_r_in;
1893 tmp1_r = p6_r_in + p5_r_in;
1894 tmp1_r += p4_r_in;
1895 tmp1_r += p3_r_in;
1896 tmp1_r += p2_r_in;
1897 tmp1_r += p1_r_in;
1898 tmp1_r += p0_r_in;
1899 tmp1_r += tmp0_r;
1900
1901 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1902 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1903 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1904 ST_D1(p6, 0, src);
1905 src += 16;
1906
1907 /* p5 */
1908 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1909 tmp0_r = p5_r_in - p6_r_in;
1910 tmp0_r += q1_r_in;
1911 tmp0_r -= p7_r_in;
1912 tmp1_r += tmp0_r;
1913 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1914 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1915 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1916 ST_D1(p5, 0, src);
1917 src += 16;
1918
1919 /* p4 */
1920 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1921 tmp0_r = p4_r_in - p5_r_in;
1922 tmp0_r += q2_r_in;
1923 tmp0_r -= p7_r_in;
1924 tmp1_r += tmp0_r;
1925 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1926 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1927 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1928 ST_D1(p4, 0, src);
1929 src += 16;
1930
1931 /* p3 */
1932 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1933 tmp0_r = p3_r_in - p4_r_in;
1934 tmp0_r += q3_r_in;
1935 tmp0_r -= p7_r_in;
1936 tmp1_r += tmp0_r;
1937 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1938 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1939 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1940 ST_D1(p3, 0, src);
1941 src += 16;
1942
1943 /* p2 */
1944 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1945 filter8 = LD_UB(filter48);
1946 tmp0_r = p2_r_in - p3_r_in;
1947 tmp0_r += q4_r_in;
1948 tmp0_r -= p7_r_in;
1949 tmp1_r += tmp0_r;
1950 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1951 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1952 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1953 ST_D1(filter8, 0, src);
1954 src += 16;
1955
1956 /* p1 */
1957 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1958 filter8 = LD_UB(filter48 + 16);
1959 tmp0_r = p1_r_in - p2_r_in;
1960 tmp0_r += q5_r_in;
1961 tmp0_r -= p7_r_in;
1962 tmp1_r += tmp0_r;
1963 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1964 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1965 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1966 ST_D1(filter8, 0, src);
1967 src += 16;
1968
1969 /* p0 */
1970 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1971 filter8 = LD_UB(filter48 + 32);
1972 tmp0_r = p0_r_in - p1_r_in;
1973 tmp0_r += q6_r_in;
1974 tmp0_r -= p7_r_in;
1975 tmp1_r += tmp0_r;
1976 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1977 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1978 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1979 ST_D1(filter8, 0, src);
1980 src += 16;
1981
1982 /* q0 */
1983 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1984 filter8 = LD_UB(filter48 + 48);
1985 tmp0_r = q7_r_in - p0_r_in;
1986 tmp0_r += q0_r_in;
1987 tmp0_r -= p7_r_in;
1988 tmp1_r += tmp0_r;
1989 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1990 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1991 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1992 ST_D1(filter8, 0, src);
1993 src += 16;
1994
1995 /* q1 */
1996 filter8 = LD_UB(filter48 + 64);
1997 tmp0_r = q7_r_in - q0_r_in;
1998 tmp0_r += q1_r_in;
1999 tmp0_r -= p6_r_in;
2000 tmp1_r += tmp0_r;
2001 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2002 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2003 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2004 ST_D1(filter8, 0, src);
2005 src += 16;
2006
2007 /* q2 */
2008 filter8 = LD_UB(filter48 + 80);
2009 tmp0_r = q7_r_in - q1_r_in;
2010 tmp0_r += q2_r_in;
2011 tmp0_r -= p5_r_in;
2012 tmp1_r += tmp0_r;
2013 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2014 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2015 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2016 ST_D1(filter8, 0, src);
2017 src += 16;
2018
2019 /* q3 */
2020 tmp0_r = q7_r_in - q2_r_in;
2021 tmp0_r += q3_r_in;
2022 tmp0_r -= p4_r_in;
2023 tmp1_r += tmp0_r;
2024 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2025 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2026 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2027 ST_D1(q3, 0, src);
2028 src += 16;
2029
2030 /* q4 */
2031 tmp0_r = q7_r_in - q3_r_in;
2032 tmp0_r += q4_r_in;
2033 tmp0_r -= p3_r_in;
2034 tmp1_r += tmp0_r;
2035 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2036 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2037 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2038 ST_D1(q4, 0, src);
2039 src += 16;
2040
2041 /* q5 */
2042 tmp0_r = q7_r_in - q4_r_in;
2043 tmp0_r += q5_r_in;
2044 tmp0_r -= p2_r_in;
2045 tmp1_r += tmp0_r;
2046 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2047 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2048 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2049 ST_D1(q5, 0, src);
2050 src += 16;
2051
2052 /* q6 */
2053 tmp0_r = q7_r_in - q5_r_in;
2054 tmp0_r += q6_r_in;
2055 tmp0_r -= p1_r_in;
2056 tmp1_r += tmp0_r;
2057 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2058 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2059 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2060 ST_D1(q6, 0, src);
2061
2062 return 0;
2063 }
2064 }
2065
ff_loop_filter_h_16_8_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)2066 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2067 int32_t b_limit_ptr,
2068 int32_t limit_ptr,
2069 int32_t thresh_ptr)
2070 {
2071 uint8_t early_exit = 0;
2072 uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2073 uint8_t *filter48 = &transposed_input[16 * 16];
2074
2075 vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2076
2077 early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2078 &filter48[0], src, pitch,
2079 b_limit_ptr, limit_ptr, thresh_ptr);
2080
2081 if (0 == early_exit) {
2082 early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2083 &filter48[0]);
2084
2085 if (0 == early_exit) {
2086 vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2087 }
2088 }
2089 }
2090
vp9_vt_lpf_t4_and_t8_16w(uint8_t * src,uint8_t * filter48,uint8_t * src_org,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)2091 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2092 uint8_t *src_org, ptrdiff_t pitch,
2093 int32_t b_limit_ptr,
2094 int32_t limit_ptr,
2095 int32_t thresh_ptr)
2096 {
2097 v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2098 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2099 v16u8 flat, mask, hev, thresh, b_limit, limit;
2100 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2101 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2102 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2103 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2104 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2105 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2106 v16i8 zero = { 0 };
2107 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2108
2109 /* load vector elements */
2110 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2111
2112 thresh = (v16u8) __msa_fill_b(thresh_ptr);
2113 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2114 limit = (v16u8) __msa_fill_b(limit_ptr);
2115
2116 /* mask and hev */
2117 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2118 hev, mask, flat);
2119 /* flat4 */
2120 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2121 /* filter4 */
2122 VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2123 q1_out);
2124
2125 /* if flat is zero for all pixels, then no need to calculate other filter */
2126 if (__msa_test_bz_v(flat)) {
2127 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2128 ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2129 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2130 ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2131
2132 src_org -= 2;
2133 ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2134 ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2135
2136 return 1;
2137 } else {
2138 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2139 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2140 q3_r);
2141 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2142 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2143 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2144 p0_l);
2145 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2146 q3_l);
2147 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2148 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2149
2150 /* convert 16 bit output data into 8 bit */
2151 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2152 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2153 p0_filt8_r, q0_filt8_r);
2154 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2155 q2_filt8_r);
2156
2157 /* store pixel values */
2158 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2159 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2160 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2161 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2162 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2163 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2164
2165 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2166 filter48 += (4 * 16);
2167 ST_UB2(q1_out, q2_out, filter48, 16);
2168 filter48 += (2 * 16);
2169 ST_UB(flat, filter48);
2170
2171 return 0;
2172 }
2173 }
2174
vp9_vt_lpf_t16_16w(uint8_t * src,uint8_t * src_org,ptrdiff_t pitch,uint8_t * filter48)2175 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2176 uint8_t *filter48)
2177 {
2178 v16u8 flat, flat2, filter8;
2179 v16i8 zero = { 0 };
2180 v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2181 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2182 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2183 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2184 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2185 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2186 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2187 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2188 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2189 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2190 v8i16 l_out, r_out;
2191
2192 flat = LD_UB(filter48 + 6 * 16);
2193
2194 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2195 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2196
2197 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2198
2199 /* if flat2 is zero for all pixels, then no need to calculate other filter */
2200 if (__msa_test_bz_v(flat2)) {
2201 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2202
2203 LD_UB4(filter48, 16, p2, p1, p0, q0);
2204 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2205
2206 ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2207 ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2208 ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2209 ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2210 ILVRL_B2_SH(q2, q1, vec2, vec5);
2211
2212 src_org -= 3;
2213 ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2214 ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2215 src_org += (4 * pitch);
2216 ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2217 ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2218 src_org += (4 * pitch);
2219 ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2220 ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2221 src_org += (4 * pitch);
2222 ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2223 ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2224
2225 return 1;
2226 } else {
2227 src -= 7 * 16;
2228
2229 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2230 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2231 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2232 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2233
2234 tmp0_r = p7_r_in << 3;
2235 tmp0_r -= p7_r_in;
2236 tmp0_r += p6_r_in;
2237 tmp0_r += q0_r_in;
2238 tmp1_r = p6_r_in + p5_r_in;
2239 tmp1_r += p4_r_in;
2240 tmp1_r += p3_r_in;
2241 tmp1_r += p2_r_in;
2242 tmp1_r += p1_r_in;
2243 tmp1_r += p0_r_in;
2244 tmp1_r += tmp0_r;
2245 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2246
2247 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2248 p5_l_in, p4_l_in);
2249 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2250 p1_l_in, p0_l_in);
2251 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2252
2253 tmp0_l = p7_l_in << 3;
2254 tmp0_l -= p7_l_in;
2255 tmp0_l += p6_l_in;
2256 tmp0_l += q0_l_in;
2257 tmp1_l = p6_l_in + p5_l_in;
2258 tmp1_l += p4_l_in;
2259 tmp1_l += p3_l_in;
2260 tmp1_l += p2_l_in;
2261 tmp1_l += p1_l_in;
2262 tmp1_l += p0_l_in;
2263 tmp1_l += tmp0_l;
2264 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2265
2266 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2267 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2268 ST_UB(p6, src);
2269 src += 16;
2270
2271 /* p5 */
2272 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2273 tmp0_r = p5_r_in - p6_r_in;
2274 tmp0_r += q1_r_in;
2275 tmp0_r -= p7_r_in;
2276 tmp1_r += tmp0_r;
2277 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2278 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2279 tmp0_l = p5_l_in - p6_l_in;
2280 tmp0_l += q1_l_in;
2281 tmp0_l -= p7_l_in;
2282 tmp1_l += tmp0_l;
2283 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2284 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2285 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2286 ST_UB(p5, src);
2287 src += 16;
2288
2289 /* p4 */
2290 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2291 tmp0_r = p4_r_in - p5_r_in;
2292 tmp0_r += q2_r_in;
2293 tmp0_r -= p7_r_in;
2294 tmp1_r += tmp0_r;
2295 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2296 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2297 tmp0_l = p4_l_in - p5_l_in;
2298 tmp0_l += q2_l_in;
2299 tmp0_l -= p7_l_in;
2300 tmp1_l += tmp0_l;
2301 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2302 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2303 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2304 ST_UB(p4, src);
2305 src += 16;
2306
2307 /* p3 */
2308 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2309 tmp0_r = p3_r_in - p4_r_in;
2310 tmp0_r += q3_r_in;
2311 tmp0_r -= p7_r_in;
2312 tmp1_r += tmp0_r;
2313 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2314 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2315 tmp0_l = p3_l_in - p4_l_in;
2316 tmp0_l += q3_l_in;
2317 tmp0_l -= p7_l_in;
2318 tmp1_l += tmp0_l;
2319 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2320 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2321 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2322 ST_UB(p3, src);
2323 src += 16;
2324
2325 /* p2 */
2326 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2327 filter8 = LD_UB(filter48);
2328 tmp0_r = p2_r_in - p3_r_in;
2329 tmp0_r += q4_r_in;
2330 tmp0_r -= p7_r_in;
2331 tmp1_r += tmp0_r;
2332 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2333 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2334 tmp0_l = p2_l_in - p3_l_in;
2335 tmp0_l += q4_l_in;
2336 tmp0_l -= p7_l_in;
2337 tmp1_l += tmp0_l;
2338 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2340 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2341 ST_UB(filter8, src);
2342 src += 16;
2343
2344 /* p1 */
2345 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2346 filter8 = LD_UB(filter48 + 16);
2347 tmp0_r = p1_r_in - p2_r_in;
2348 tmp0_r += q5_r_in;
2349 tmp0_r -= p7_r_in;
2350 tmp1_r += tmp0_r;
2351 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2353 tmp0_l = p1_l_in - p2_l_in;
2354 tmp0_l += q5_l_in;
2355 tmp0_l -= p7_l_in;
2356 tmp1_l += tmp0_l;
2357 l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2358 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2360 ST_UB(filter8, src);
2361 src += 16;
2362
2363 /* p0 */
2364 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2365 filter8 = LD_UB(filter48 + 32);
2366 tmp0_r = p0_r_in - p1_r_in;
2367 tmp0_r += q6_r_in;
2368 tmp0_r -= p7_r_in;
2369 tmp1_r += tmp0_r;
2370 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2371 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2372 tmp0_l = p0_l_in - p1_l_in;
2373 tmp0_l += q6_l_in;
2374 tmp0_l -= p7_l_in;
2375 tmp1_l += tmp0_l;
2376 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2377 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2378 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2379 ST_UB(filter8, src);
2380 src += 16;
2381
2382 /* q0 */
2383 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2384 filter8 = LD_UB(filter48 + 48);
2385 tmp0_r = q7_r_in - p0_r_in;
2386 tmp0_r += q0_r_in;
2387 tmp0_r -= p7_r_in;
2388 tmp1_r += tmp0_r;
2389 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2390 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2391 tmp0_l = q7_l_in - p0_l_in;
2392 tmp0_l += q0_l_in;
2393 tmp0_l -= p7_l_in;
2394 tmp1_l += tmp0_l;
2395 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2396 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2397 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2398 ST_UB(filter8, src);
2399 src += 16;
2400
2401 /* q1 */
2402 filter8 = LD_UB(filter48 + 64);
2403 tmp0_r = q7_r_in - q0_r_in;
2404 tmp0_r += q1_r_in;
2405 tmp0_r -= p6_r_in;
2406 tmp1_r += tmp0_r;
2407 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2408 tmp0_l = q7_l_in - q0_l_in;
2409 tmp0_l += q1_l_in;
2410 tmp0_l -= p6_l_in;
2411 tmp1_l += tmp0_l;
2412 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415 ST_UB(filter8, src);
2416 src += 16;
2417
2418 /* q2 */
2419 filter8 = LD_UB(filter48 + 80);
2420 tmp0_r = q7_r_in - q1_r_in;
2421 tmp0_r += q2_r_in;
2422 tmp0_r -= p5_r_in;
2423 tmp1_r += tmp0_r;
2424 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2425 tmp0_l = q7_l_in - q1_l_in;
2426 tmp0_l += q2_l_in;
2427 tmp0_l -= p5_l_in;
2428 tmp1_l += tmp0_l;
2429 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2430 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2431 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2432 ST_UB(filter8, src);
2433 src += 16;
2434
2435 /* q3 */
2436 tmp0_r = q7_r_in - q2_r_in;
2437 tmp0_r += q3_r_in;
2438 tmp0_r -= p4_r_in;
2439 tmp1_r += tmp0_r;
2440 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2441 tmp0_l = q7_l_in - q2_l_in;
2442 tmp0_l += q3_l_in;
2443 tmp0_l -= p4_l_in;
2444 tmp1_l += tmp0_l;
2445 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2446 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2447 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2448 ST_UB(q3, src);
2449 src += 16;
2450
2451 /* q4 */
2452 tmp0_r = q7_r_in - q3_r_in;
2453 tmp0_r += q4_r_in;
2454 tmp0_r -= p3_r_in;
2455 tmp1_r += tmp0_r;
2456 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2457 tmp0_l = q7_l_in - q3_l_in;
2458 tmp0_l += q4_l_in;
2459 tmp0_l -= p3_l_in;
2460 tmp1_l += tmp0_l;
2461 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2462 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2463 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2464 ST_UB(q4, src);
2465 src += 16;
2466
2467 /* q5 */
2468 tmp0_r = q7_r_in - q4_r_in;
2469 tmp0_r += q5_r_in;
2470 tmp0_r -= p2_r_in;
2471 tmp1_r += tmp0_r;
2472 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2473 tmp0_l = q7_l_in - q4_l_in;
2474 tmp0_l += q5_l_in;
2475 tmp0_l -= p2_l_in;
2476 tmp1_l += tmp0_l;
2477 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2478 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2479 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2480 ST_UB(q5, src);
2481 src += 16;
2482
2483 /* q6 */
2484 tmp0_r = q7_r_in - q5_r_in;
2485 tmp0_r += q6_r_in;
2486 tmp0_r -= p1_r_in;
2487 tmp1_r += tmp0_r;
2488 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2489 tmp0_l = q7_l_in - q5_l_in;
2490 tmp0_l += q6_l_in;
2491 tmp0_l -= p1_l_in;
2492 tmp1_l += tmp0_l;
2493 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2494 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2495 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2496 ST_UB(q6, src);
2497
2498 return 0;
2499 }
2500 }
2501
ff_loop_filter_h_16_16_msa(uint8_t * src,ptrdiff_t pitch,int32_t b_limit_ptr,int32_t limit_ptr,int32_t thresh_ptr)2502 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2503 int32_t b_limit_ptr,
2504 int32_t limit_ptr,
2505 int32_t thresh_ptr)
2506 {
2507 uint8_t early_exit = 0;
2508 uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2509 uint8_t *filter48 = &transposed_input[16 * 16];
2510
2511 vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2512
2513 early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2514 &filter48[0], src, pitch,
2515 b_limit_ptr, limit_ptr, thresh_ptr);
2516
2517 if (0 == early_exit) {
2518 early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2519 &filter48[0]);
2520
2521 if (0 == early_exit) {
2522 vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2523 }
2524 }
2525 }
2526