/*
 * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/vp9dsp.h"
#include "libavutil/mips/generic_macros_msa.h"
#include "vp9dsp_mips.h"

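/* 4-tap loop filter: produces filtered p1/p0/q0/q1 in the *_out arguments.
 * p0/q0 are adjusted wherever mask_in is set; the outer pixels p1/q1 are
 * only adjusted where the high-edge-variance flag (hev_in) is clear. */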
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                           p1_out, p0_out, q0_out, q1_out)               \
{                                                                        \
    v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2;         \
    const v16i8 cnst4b = __msa_ldi_b(4);                                 \
    const v16i8 cnst3b = __msa_ldi_b(3);                                 \
                                                                         \
    p1_m = (v16i8) __msa_xori_b(p1_in, 0x80);                            \
    p0_m = (v16i8) __msa_xori_b(p0_in, 0x80);                            \
    q0_m = (v16i8) __msa_xori_b(q0_in, 0x80);                            \
    q1_m = (v16i8) __msa_xori_b(q1_in, 0x80);                            \
                                                                         \
    filt = __msa_subs_s_b(p1_m, q1_m);                                   \
                                                                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = __msa_adds_s_b(filt, q0_sub_p0);                              \
    filt = filt & (v16i8) mask_in;                                       \
                                                                         \
    filt1 = __msa_adds_s_b(filt, cnst4b);                                \
    filt1 >>= 3;                                                         \
                                                                         \
    filt2 = __msa_adds_s_b(filt, cnst3b);                                \
    filt2 >>= 3;                                                         \
                                                                         \
    q0_m = __msa_subs_s_b(q0_m, filt1);                                  \
    q0_out = __msa_xori_b((v16u8) q0_m, 0x80);                           \
    p0_m = __msa_adds_s_b(p0_m, filt2);                                  \
    p0_out = __msa_xori_b((v16u8) p0_m, 0x80);                           \
                                                                         \
    filt = __msa_srari_b(filt1, 1);                                      \
    hev_in = __msa_xori_b((v16u8) hev_in, 0xff);                         \
    filt = filt & (v16i8) hev_in;                                        \
                                                                         \
    q1_m = __msa_subs_s_b(q1_m, filt);                                   \
    q1_out = __msa_xori_b((v16u8) q1_m, 0x80);                           \
    p1_m = __msa_adds_s_b(p1_m, filt);                                   \
    p1_out = __msa_xori_b((v16u8) p1_m, 0x80);                           \
}

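/* "flat" selector for the 8-tap filter: extends the incoming flat_out value
 * (max(|p1 - p0|, |q1 - q0|) from LPF_MASK_HEV) with |p2 - p0|, |p3 - p0|,
 * |q2 - q0| and |q3 - q0|; the result is 0xff where all of these are <= 1
 * and the pixel is selected by the enclosing 'mask', 0 elsewhere. */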
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)  \
{                                                                      \
    v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0;     \
    v16u8 zero_in = { 0 };                                             \
                                                                       \
    tmp = __msa_ori_b(zero_in, 1);                                     \
    p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in);                        \
    q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in);                        \
    p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in);                        \
    q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in);                        \
                                                                       \
    p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0);             \
    flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out);                   \
    p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0);             \
    flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out);                   \
                                                                       \
    flat_out = (tmp < (v16u8) flat_out);                               \
    flat_out = __msa_xori_b(flat_out, 0xff);                           \
    flat_out = flat_out & (mask);                                      \
}

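/* "flat2" selector for the wide (16 sample) filter: set where
 * |p4..p7 - p0| and |q4..q7 - q0| are all <= 1, restricted to pixels
 * already flagged in flat_in. */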
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,  \
                  q5_in, q6_in, q7_in, flat_in, flat2_out)          \
{                                                                   \
    v16u8 tmp, zero_in = { 0 };                                     \
    v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0;       \
    v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0;       \
                                                                    \
    tmp = __msa_ori_b(zero_in, 1);                                  \
    p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in);                     \
    q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in);                     \
    p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in);                     \
    q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in);                     \
    p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in);                     \
    q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in);                     \
    p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in);                     \
    q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in);                     \
                                                                    \
    p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0);            \
    flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out);              \
    p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out);              \
    p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0);          \
    flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out);              \
                                                                    \
    flat2_out = (tmp < (v16u8) flat2_out);                          \
    flat2_out = __msa_xori_b(flat2_out, 0xff);                      \
    flat2_out = flat2_out & flat_in;                                \
}

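/* 8-tap smoothing filter: each *_filt8_out lane is a rounded weighted
 * average of the p3..q3 samples, computed on 16 bit lanes as (sum + 4) >> 3. */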
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,                \
                    q0_in, q1_in, q2_in, q3_in,                \
                    p2_filt8_out, p1_filt8_out, p0_filt8_out,  \
                    q0_filt8_out, q1_filt8_out, q2_filt8_out)  \
{                                                              \
    v8u16 tmp0, tmp1, tmp2;                                    \
                                                               \
    tmp2 = p2_in + p1_in + p0_in;                              \
    tmp0 = p3_in << 1;                                         \
                                                               \
    tmp0 = tmp0 + tmp2 + q0_in;                                \
    tmp1 = tmp0 + p3_in + p2_in;                               \
    p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 + p1_in + q1_in;                               \
    p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = q2_in + q1_in + q0_in;                              \
    tmp2 = tmp2 + tmp1;                                        \
    tmp0 = tmp2 + (p0_in);                                     \
    tmp0 = tmp0 + (p3_in);                                     \
    p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3);     \
                                                               \
    tmp0 = q2_in + q3_in;                                      \
    tmp0 = p0_in + tmp1 + tmp0;                                \
    tmp1 = q3_in + q3_in;                                      \
    tmp1 = tmp1 + tmp0;                                        \
    q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp0 = tmp2 + q3_in;                                       \
    tmp1 = tmp0 + q0_in;                                       \
    q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
                                                               \
    tmp1 = tmp0 - p2_in;                                       \
    tmp0 = q1_in + q3_in;                                      \
    tmp1 = tmp0 + tmp1;                                        \
    q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3);     \
}

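/* Builds the per-pixel filter decision masks: hev_out flags high edge
 * variance (max(|p1 - p0|, |q1 - q0|) > thresh) and mask_out flags pixels
 * whose inter-sample differences stay within limit/b_limit and therefore
 * get filtered. flat_out returns max(|p1 - p0|, |q1 - q0|), which VP9_FLAT4
 * later reuses. */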
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,                   \
                     q0_in, q1_in, q2_in, q3_in,                   \
                     limit_in, b_limit_in, thresh_in,              \
                     hev_out, mask_out, flat_out)                  \
{                                                                  \
    v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;  \
    v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;  \
                                                                   \
    /* absolute subtraction of pixel values */                     \
    p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in);                   \
    p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in);                   \
    p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in);                   \
    q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in);                   \
    q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in);                   \
    q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in);                   \
    p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in);                   \
    p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in);                   \
                                                                   \
    /* calculation of hev */                                       \
    flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m);          \
    hev_out = thresh_in < (v16u8) flat_out;                        \
                                                                   \
    /* calculation of mask */                                      \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m);     \
    p1_asub_q1_m >>= 1;                                            \
    p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m);     \
                                                                   \
    mask_out = b_limit_in < p0_asub_q0_m;                          \
    mask_out = __msa_max_u_b(flat_out, mask_out);                  \
    p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m);      \
    mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out);              \
    q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m);      \
    mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out);              \
                                                                   \
    mask_out = limit_in < (v16u8) mask_out;                        \
    mask_out = __msa_xori_b(mask_out, 0xff);                       \
}

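/* 4-tap filtering of an 8 pixel wide edge: p3..q3 are loaded with a stride
 * of pitch, and the filtered p1/p0/q0/q1 rows are stored back as 64 bit
 * doublewords. */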
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p1_d, p0_d, q0_d, q1_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
    p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
    q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
    q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
    SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
}


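/* Same filtering for a 16 pixel wide edge made of two adjacent 8 pixel
 * blocks; each *_ptr argument packs the limits of the first block in
 * bits 0-7 and those of the second block in bits 8-15. */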
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
    thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);

    b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
    b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);

    limit0 = (v16u8) __msa_fill_b(limit_ptr);
    limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
                 hev, mask, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);

    ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
}

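/* 8 pixel wide edge, 4-tap plus 8-tap filtering: where 'flat' is set the
 * VP9_FILTER8 output replaces the 4-tap result before the store. */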
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
                              int32_t b_limit_ptr,
                              int32_t limit_ptr,
                              int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    v16u8 mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                    p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

        src -= 3 * pitch;

        SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
        src += (4 * pitch);
        SD(q1_d, src);
        src += pitch;
        SD(q2_d, src);
    }
}

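/* 16 pixel wide edge with the 8-tap filter applied to both 8 pixel halves
 * (per-half limits packed as in the 44 variant). */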
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

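/* 16 pixel wide edge: 8-tap filtering on the first 8 pixels, 4-tap only on
 * the remaining 8 (the upper half of 'flat' is cleared). */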
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
                    p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
                    p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
                    q1_filt8_r, q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

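/* 16 pixel wide edge: 4-tap only on the first 8 pixels, 8-tap filtering on
 * the remaining 8 (the lower half of 'flat' is cleared). */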
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
    thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);

    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
    b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);

    limit = (v16u8) __msa_fill_b(limit_ptr);
    tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
    limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
    } else {
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
                    p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
                    p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
                    q1_filt8_l, q2_filt8_l);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);

        src -= 3 * pitch;

        ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1_out, q2_out, src, pitch);
        src += (2 * pitch);
    }
}

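/* First pass of the wide filter over a 16 pixel edge: applies the 4-tap and
 * 8-tap filters. Returns 1 when no pixel is flat (nothing else to do);
 * otherwise stores the six filter8 rows and the flat mask into the
 * 16-byte-strided filter48 scratch buffer and returns 0. */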
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
                                        uint8_t *filter48,
                                        int32_t b_limit_ptr,
                                        int32_t limit_ptr,
                                        int32_t thresh_ptr)
{
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 flat, mask, hev, thresh, b_limit, limit;
    v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
    v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
    v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
    v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
    v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
    v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
    v16u8 zero = { 0 };

    /* load vector elements */
    LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    /* mask and hev */
    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

        return 1;
    } else {
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                   zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
                   q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                    p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
                   p0_l);
        ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
                   q3_l);
        VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                    p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                    p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                    p0_filt8_r, q0_filt8_r);
        PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                    q2_filt8_r);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);

        ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
        filter48 += (4 * 16);
        ST_UB2(q1_out, q2_out, filter48, 16);
        filter48 += (2 * 16);
        ST_UB(flat, filter48);

        return 0;
    }
}

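/* Second pass of the wide filter: builds the flat2 mask from p7..q7 and,
 * where it is set, applies the wide filter row by row (rounded as
 * (sum + 8) >> 4); elsewhere the filter8 results saved in filter48 are
 * stored back unchanged. */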
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
{
    v16u8 flat, flat2, filter8;
    v16i8 zero = { 0 };
    v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
    v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
    v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
    v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
    v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
    v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
    v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
    v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
    v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
    v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
    v8i16 l_out, r_out;

    flat = LD_UB(filter48 + 96);

    LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
    LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
    VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

    /* if flat2 is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat2)) {
        LD_UB4(filter48, 16, p2, p1, p0, q0);
        LD_UB2(filter48 + 4 * 16, 16, q1, q2);

        src -= 3 * pitch;
        ST_UB4(p2, p1, p0, q0, src, pitch);
        src += (4 * pitch);
        ST_UB2(q1, q2, src, pitch);
    } else {
        src -= 7 * pitch;

        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
                   zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
                   p3_r_in, p2_r_in, p1_r_in, p0_r_in);

        q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);

        tmp0_r = p7_r_in << 3;
        tmp0_r -= p7_r_in;
        tmp0_r += p6_r_in;
        tmp0_r += q0_r_in;
        tmp1_r = p6_r_in + p5_r_in;
        tmp1_r += p4_r_in;
        tmp1_r += p3_r_in;
        tmp1_r += p2_r_in;
        tmp1_r += p1_r_in;
        tmp1_r += p0_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
                   p5_l_in, p4_l_in);
        ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
                   p1_l_in, p0_l_in);
        q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);

        tmp0_l = p7_l_in << 3;
        tmp0_l -= p7_l_in;
        tmp0_l += p6_l_in;
        tmp0_l += q0_l_in;
        tmp1_l = p6_l_in + p5_l_in;
        tmp1_l += p4_l_in;
        tmp1_l += p3_l_in;
        tmp1_l += p2_l_in;
        tmp1_l += p1_l_in;
        tmp1_l += p0_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
        ST_UB(p6, src);
        src += pitch;

        /* p5 */
        q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
        tmp0_r = p5_r_in - p6_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
        tmp0_l = p5_l_in - p6_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
        ST_UB(p5, src);
        src += pitch;

        /* p4 */
        q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
        tmp0_r = p4_r_in - p5_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);

        q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
        tmp0_l = p4_l_in - p5_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
        ST_UB(p4, src);
        src += pitch;

        /* p3 */
        q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
        tmp0_r = p3_r_in - p4_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
        tmp0_l = p3_l_in - p4_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
        ST_UB(p3, src);
        src += pitch;

        /* p2 */
        q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
        filter8 = LD_UB(filter48);
        tmp0_r = p2_r_in - p3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
        tmp0_l = p2_l_in - p3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p1 */
        q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
        filter8 = LD_UB(filter48 + 16);
        tmp0_r = p1_r_in - p2_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
        tmp0_l = p1_l_in - p2_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* p0 */
        q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
        filter8 = LD_UB(filter48 + 32);
        tmp0_r = p0_r_in - p1_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
        tmp0_l = p0_l_in - p1_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q0 */
        q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
        filter8 = LD_UB(filter48 + 48);
        tmp0_r = q7_r_in - p0_r_in;
        tmp0_r += q0_r_in;
        tmp0_r -= p7_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
        tmp0_l = q7_l_in - p0_l_in;
        tmp0_l += q0_l_in;
        tmp0_l -= p7_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q1 */
        filter8 = LD_UB(filter48 + 64);
        tmp0_r = q7_r_in - q0_r_in;
        tmp0_r += q1_r_in;
        tmp0_r -= p6_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q0_l_in;
        tmp0_l += q1_l_in;
        tmp0_l -= p6_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q2 */
        filter8 = LD_UB(filter48 + 80);
        tmp0_r = q7_r_in - q1_r_in;
        tmp0_r += q2_r_in;
        tmp0_r -= p5_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q1_l_in;
        tmp0_l += q2_l_in;
        tmp0_l -= p5_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
        ST_UB(filter8, src);
        src += pitch;

        /* q3 */
        tmp0_r = q7_r_in - q2_r_in;
        tmp0_r += q3_r_in;
        tmp0_r -= p4_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q2_l_in;
        tmp0_l += q3_l_in;
        tmp0_l -= p4_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
        ST_UB(q3, src);
        src += pitch;

        /* q4 */
        tmp0_r = q7_r_in - q3_r_in;
        tmp0_r += q4_r_in;
        tmp0_r -= p3_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q3_l_in;
        tmp0_l += q4_l_in;
        tmp0_l -= p3_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
        ST_UB(q4, src);
        src += pitch;

        /* q5 */
        tmp0_r = q7_r_in - q4_r_in;
        tmp0_r += q5_r_in;
        tmp0_r -= p2_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q4_l_in;
        tmp0_l += q5_l_in;
        tmp0_l -= p2_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
        ST_UB(q5, src);
        src += pitch;

        /* q6 */
        tmp0_r = q7_r_in - q5_r_in;
        tmp0_r += q6_r_in;
        tmp0_r -= p1_r_in;
        tmp1_r += tmp0_r;
        r_out = __msa_srari_h((v8i16) tmp1_r, 4);

        tmp0_l = q7_l_in - q5_l_in;
        tmp0_l += q6_l_in;
        tmp0_l -= p1_l_in;
        tmp1_l += tmp0_l;
        l_out = __msa_srari_h((v8i16) tmp1_l, 4);

        r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
        q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
        ST_UB(q6, src);
    }
}

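/* 16 pixel wide edge, full wide filter: runs the two helper passes above,
 * skipping the second one when the first pass reports an early exit. */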
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
                                int32_t b_limit_ptr,
                                int32_t limit_ptr,
                                int32_t thresh_ptr)
{
    uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
    uint8_t early_exit = 0;

    early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
                                          b_limit_ptr, limit_ptr, thresh_ptr);

    if (0 == early_exit) {
        vp9_hz_lpf_t16_16w(src, pitch, filter48);
    }
}

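/* 8 pixel wide edge, wide filter: same filtering as the 16 pixel version
 * but done in a single pass with 64 bit stores, without a scratch buffer. */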
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
                               int32_t b_limit_ptr,
                               int32_t limit_ptr,
                               int32_t thresh_ptr)
{
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8) __msa_fill_b(thresh_ptr);
    b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
    limit = (v16u8) __msa_fill_b(limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
                 hev, mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);

    /* if flat is zero for all pixels, then no need to calculate other filter */
    if (__msa_test_bz_v(flat)) {
        p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
        SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
        /* convert 8 bit input data into 16 bit */
        ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
                   q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
                   q1_r, q2_r, q3_r);
        VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
                    p2_filter8, p1_filter8, p0_filter8, q0_filter8,
                    q1_filter8, q2_filter8);

        /* convert 16 bit output data into 8 bit */
        PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
                    zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
                    q0_filter8);
        PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
                    q2_filter8);

        /* store pixel values */
        p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
        p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
        p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
        q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
        q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
        q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);

        /* load 16 vector elements */
        LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
        LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

        VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

        /* if flat2 is zero for all pixels, then no need to calculate other filter */
        if (__msa_test_bz_v(flat2)) {
            p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
            p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
            p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
            q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
            q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
            q2_d = __msa_copy_u_d((v2i64) q2_out, 0);

            SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
            SD(q1_d, src + pitch);
            SD(q2_d, src + 2 * pitch);
        } else {
            /* LSB(right) 8 pixel operation */
            ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
                       zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
                       q4_r, q5_r, q6_r, q7_r);

            tmp0 = p7_r << 3;
            tmp0 -= p7_r;
            tmp0 += p6_r;
            tmp0 += q0_r;

            src -= 7 * pitch;

            /* calculation of p6 and p5 */
            tmp1 = p6_r + p5_r + p4_r + p3_r;
            tmp1 += (p2_r + p1_r + p0_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp0 = p5_r - p6_r + q1_r - p7_r;
            tmp1 += tmp0;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p4 and p3 */
            tmp0 = p4_r - p5_r + q2_r - p7_r;
            tmp2 = p3_r - p4_r + q3_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p2 and p1 */
            tmp0 = p2_r - p3_r + q4_r - p7_r;
            tmp2 = p1_r - p2_r + q5_r - p7_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of p0 and q0 */
            tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
            tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q1 and q2 */
            tmp0 = q7_r - q0_r + q1_r - p6_r;
            tmp2 = q7_r - q1_r + q2_r - p5_r;
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q3 and q4 */
            tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
            tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
            src += pitch;

            /* calculation of q5 and q6 */
            tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
            tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
            tmp1 += tmp0;
            p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            tmp1 += tmp2;
            p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
            PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
                        p0_filter16, p1_filter16);
            p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
            p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
            dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
            dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
            SD(dword0, src);
            src += pitch;
            SD(dword1, src);
        }
    }
}

1198 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1199                               int32_t b_limit_ptr,
1200                               int32_t limit_ptr,
1201                               int32_t thresh_ptr)
1202 {
1203     v16u8 mask, hev, flat, limit, thresh, b_limit;
1204     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1205     v8i16 vec0, vec1, vec2, vec3;
1206 
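    /* load 8 rows across the vertical edge; the transpose below turns the
     * p3..q3 columns into vectors */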
1207     LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1208 
1209     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211     limit = (v16u8) __msa_fill_b(limit_ptr);
1212 
1213     TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1214                        p3, p2, p1, p0, q0, q1, q2, q3);
1215     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1216                  hev, mask, flat);
1217     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
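    /* interleave the filtered p1, p0, q0, q1 back into row order and store
     * 4 bytes per row */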
1218     ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1219     ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1220 
1221     src -= 2;
1222     ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1223 }
1224 
1225 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1226                                 int32_t b_limit_ptr,
1227                                 int32_t limit_ptr,
1228                                 int32_t thresh_ptr)
1229 {
1230     v16u8 mask, hev, flat;
1231     v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1232     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1233     v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1234     v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1235     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1236 
1237     LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1238     LD_UB8(src - 4 + (8 * pitch), pitch,
1239            row8, row9, row10, row11, row12, row13, row14, row15);
1240 
1241     TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1242                         row8, row9, row10, row11, row12, row13, row14, row15,
1243                         p3, p2, p1, p0, q0, q1, q2, q3);
1244 
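    /* each *_ptr argument packs two filter parameters: the low byte applies
     * to rows 0-7, the next byte to rows 8-15 */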
1245     thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1246     thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1247     thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1248 
1249     b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1250     b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1251     b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1252 
1253     limit0 = (v16u8) __msa_fill_b(limit_ptr);
1254     limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1255     limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1256 
1257     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1258                  hev, mask, flat);
1259     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1260     ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1261     ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1262     ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1263     ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1264 
1265     src -= 2;
1266 
1267     ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1268     ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1269 }
1270 
1271 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1272                               int32_t b_limit_ptr,
1273                               int32_t limit_ptr,
1274                               int32_t thresh_ptr)
1275 {
1276     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1277     v16u8 p1_out, p0_out, q0_out, q1_out;
1278     v16u8 flat, mask, hev, thresh, b_limit, limit;
1279     v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1280     v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1281     v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1282     v16u8 zero = { 0 };
1283     v8i16 vec0, vec1, vec2, vec3, vec4;
1284 
1285     /* load vector elements */
1286     LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1287 
1288     TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1289                        p3, p2, p1, p0, q0, q1, q2, q3);
1290 
1291     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1292     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1293     limit = (v16u8) __msa_fill_b(limit_ptr);
1294 
1295     /* mask and hev */
1296     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1297                  hev, mask, flat);
1298     /* flat4 */
1299     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1300     /* filter4 */
1301     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1302                        q1_out);
1303 
1304     flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1305 
1306     /* if flat is zero for all pixels, then no need to calculate other filter */
1307     if (__msa_test_bz_v(flat)) {
1308         /* Store 4 pixels p1 - q1 */
1309         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1310         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1311 
1312         src -= 2;
1313         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1314     } else {
1315         ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1316                    zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1317                    q3_r);
1318         VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1319                     p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1320         /* convert 16 bit output data into 8 bit */
1321         PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1322                     p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1323                     p0_filt8_r, q0_filt8_r);
1324         PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1325                     q2_filt8_r);
1326 
1327         /* store pixel values */
1328         p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1329         p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1330         p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1331         q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1332         q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1333         q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1334 
1335         /* Store 6 pixels p2 - q2 */
1336         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1337         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1338         vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1339 
1340         src -= 3;
1341         ST_W4(vec2, 0, 1, 2, 3, src, pitch);
1342         ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
1343         src += (4 * pitch);
1344         ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1345         ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
1346     }
1347 }
1348 
1349 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1350                                 int32_t b_limit_ptr,
1351                                 int32_t limit_ptr,
1352                                 int32_t thresh_ptr)
1353 {
1354     uint8_t *temp_src;
1355     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1356     v16u8 p1_out, p0_out, q0_out, q1_out;
1357     v16u8 flat, mask, hev, thresh, b_limit, limit;
1358     v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1359     v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1360     v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1361     v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1362     v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1363     v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1364     v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1365     v16u8 zero = { 0 };
1366     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367 
1368     temp_src = src - 4;
1369 
1370     LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1371     temp_src += (8 * pitch);
1372     LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1373 
1374     /* transpose 16x8 matrix into 8x16 */
1375     TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1376                         q3, q2, q1, q0, row12, row13, row14, row15,
1377                         p3, p2, p1, p0, q0, q1, q2, q3);
1378 
1379     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1380     vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1381     thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1382 
1383     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1384     vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1385     b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1386 
1387     limit = (v16u8) __msa_fill_b(limit_ptr);
1388     vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1389     limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1390 
1391     /* mask and hev */
1392     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1393                  hev, mask, flat);
1394     /* flat4 */
1395     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1396     /* filter4 */
1397     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1398                        q1_out);
1399 
1400     /* if flat is zero for all pixels, then no need to calculate other filter */
1401     if (__msa_test_bz_v(flat)) {
1402         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1403         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1404         ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1405         ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1406 
1407         src -= 2;
1408         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1409         ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1410     } else {
1411         ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1412                    zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1413                    q3_r);
1414         VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1415                     p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1416 
1417         ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1418                    p0_l);
1419         ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1420                    q3_l);
1421 
1422         /* filter8 */
1423         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1424                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1425 
1426         /* convert 16 bit output data into 8 bit */
1427         PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1428                     p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1429                     p0_filt8_r, q0_filt8_r);
1430         PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1431                     q2_filt8_r);
1432 
1433         /* store pixel values */
1434         p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1435         p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1436         p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1437         q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1438         q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1439         q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1440 
1441         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1442         ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1443         ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1444         ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1445         ILVRL_B2_SH(q2, q1, vec2, vec5);
1446 
1447         src -= 3;
1448         ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1449         ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1450         src += (4 * pitch);
1451         ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1452         ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1453         src += (4 * pitch);
1454         ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1455         ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1456         src += (4 * pitch);
1457         ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1458         ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1459     }
1460 }
1461 
1462 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1463                                 int32_t b_limit_ptr,
1464                                 int32_t limit_ptr,
1465                                 int32_t thresh_ptr)
1466 {
1467     uint8_t *temp_src;
1468     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1469     v16u8 p1_out, p0_out, q0_out, q1_out;
1470     v16u8 flat, mask, hev, thresh, b_limit, limit;
1471     v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1472     v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1473     v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1474     v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1475     v16u8 zero = { 0 };
1476     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1477 
1478     temp_src = src - 4;
1479 
1480     LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1481     temp_src += (8 * pitch);
1482     LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1483 
1484     /* transpose 16x8 matrix into 8x16 */
1485     TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1486                         q3, q2, q1, q0, row12, row13, row14, row15,
1487                         p3, p2, p1, p0, q0, q1, q2, q3);
1488 
1489     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1490     vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1491     thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1492 
1493     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1494     vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1495     b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1496 
1497     limit = (v16u8) __msa_fill_b(limit_ptr);
1498     vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1499     limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1500 
1501     /* mask and hev */
1502     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1503                  hev, mask, flat);
1504     /* flat4 */
1505     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1506     /* filter4 */
1507     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1508                        q1_out);
1509 
1510     flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1511 
1512     /* if flat is zero for all pixels, then no need to calculate other filter */
1513     if (__msa_test_bz_v(flat)) {
1514         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1515         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1516         ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1517         ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1518 
1519         src -= 2;
1520         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1521         ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1522     } else {
1523         ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1524                    zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1525                    q3_r);
1526         VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1527                     p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1528 
1529         /* convert 16 bit output data into 8 bit */
1530         PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1531                     p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1532                     p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1533         PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1534                     q1_filt8_r, q2_filt8_r);
1535 
1536         /* store pixel values */
1537         p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1538         p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1539         p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1540         q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1541         q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1542         q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1543 
1544         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1545         ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1546         ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1547         ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1548         ILVRL_B2_SH(q2, q1, vec2, vec5);
1549 
1550         src -= 3;
1551         ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1552         ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1553         src += (4 * pitch);
1554         ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1555         ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1556         src += (4 * pitch);
1557         ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1558         ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1559         src += (4 * pitch);
1560         ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1561         ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1562     }
1563 }
1564 
1565 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1566                                 int32_t b_limit_ptr,
1567                                 int32_t limit_ptr,
1568                                 int32_t thresh_ptr)
1569 {
1570     uint8_t *temp_src;
1571     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1572     v16u8 p1_out, p0_out, q0_out, q1_out;
1573     v16u8 flat, mask, hev, thresh, b_limit, limit;
1574     v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1575     v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1576     v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1577     v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1578     v16u8 zero = { 0 };
1579     v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1580 
1581     temp_src = src - 4;
1582 
1583     LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1584     temp_src += (8 * pitch);
1585     LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1586 
1587     /* transpose 16x8 matrix into 8x16 */
1588     TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1589                         q3, q2, q1, q0, row12, row13, row14, row15,
1590                         p3, p2, p1, p0, q0, q1, q2, q3);
1591 
1592     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1593     vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1594     thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1595 
1596     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1597     vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1598     b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1599 
1600     limit = (v16u8) __msa_fill_b(limit_ptr);
1601     vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1602     limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1603 
1604     /* mask and hev */
1605     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1606                  hev, mask, flat);
1607     /* flat4 */
1608     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1609     /* filter4 */
1610     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1611                        q1_out);
1612 
1613     flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1614 
1615     /* if flat is zero for all pixels, then no need to calculate other filter */
1616     if (__msa_test_bz_v(flat)) {
1617         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1618         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1619         ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1620         ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1621 
1622         src -= 2;
1623         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
1624         ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
1625     } else {
1626         ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1627                    p0_l);
1628         ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1629                    q3_l);
1630 
1631         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1632                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1633 
1634         /* convert 16 bit output data into 8 bit */
1635         PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1636                     p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1637                     p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1638         PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1639                     q1_filt8_l, q2_filt8_l);
1640 
1641         /* store pixel values */
1642         p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1643         p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1644         p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1645         q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1646         q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1647         q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1648 
1649         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1650         ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1651         ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1652         ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1653         ILVRL_B2_SH(q2, q1, vec2, vec5);
1654 
1655         src -= 3;
1656         ST_W4(vec3, 0, 1, 2, 3, src, pitch);
1657         ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
1658         src += (4 * pitch);
1659         ST_W4(vec4, 0, 1, 2, 3, src, pitch);
1660         ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
1661         src += (4 * pitch);
1662         ST_W4(vec6, 0, 1, 2, 3, src, pitch);
1663         ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
1664         src += (4 * pitch);
1665         ST_W4(vec7, 0, 1, 2, 3, src, pitch);
1666         ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
1667     }
1668 }
1669 
1670 static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1671                                        uint8_t *output, int32_t out_pitch)
1672 {
1673     v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1674     v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1675     v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1676     v16i8 zeros = { 0 };
1677 
1678     LD_UB8(input, in_pitch,
1679            p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1680     /* 8x8 transpose */
1681     TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1682                        p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1683     /* 8x8 transpose */
1684     ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1685                tmp0, tmp1, tmp2, tmp3);
1686     ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1687     ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1688     ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1689     ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1690     SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
1691 
1692     ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1693     output += (8 * out_pitch);
1694     ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1695 }
1696 
1697 static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1698                                        uint8_t *output, int32_t out_pitch)
1699 {
1700     v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1701     v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1702 
1703     LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1704     LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1705     TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1706                         q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1707     ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1708 }
1709 
1710 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1711                                 uint8_t *output, int32_t out_pitch)
1712 {
1713     v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1714     v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1715     v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1716     v4i32 tmp2, tmp3;
1717     v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1718 
1719     LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1720     input += (8 * in_pitch);
1721     LD_UB8(input, in_pitch,
1722            row8, row9, row10, row11, row12, row13, row14, row15);
1723 
1724     TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1725                         row8, row9, row10, row11, row12, row13, row14, row15,
1726                         p7, p6, p5, p4, p3, p2, p1, p0);
1727 
1728     /* transpose 16x8 matrix into 8x16 */
1729     /* total 8 intermediate registers and 32 instructions */
1730     q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1731     q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1732     q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1733     q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1734     q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1735     q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1736     q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1737     q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1738 
1739     ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1740     tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1741     tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1742 
1743     ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1744     tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1745     tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1746 
1747     ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1748     q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1749     q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1750 
1751     tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1752     tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1753     q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1754     q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1755 
1756     ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1757     q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1758     q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1759 
1760     tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1761     tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1762     q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1763     q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764 
1765     ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766     output += (8 * out_pitch);
1767     ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768 }
1769 
1770 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1771                                        uint8_t *src_org, int32_t pitch_org,
1772                                        int32_t b_limit_ptr,
1773                                        int32_t limit_ptr,
1774                                        int32_t thresh_ptr)
1775 {
1776     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1777     v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1778     v16u8 flat, mask, hev, thresh, b_limit, limit;
1779     v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1780     v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1781     v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1782     v16i8 zero = { 0 };
1783     v8i16 vec0, vec1, vec2, vec3;
1784 
1785     /* load vector elements */
1786     LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1787 
1788     thresh = (v16u8) __msa_fill_b(thresh_ptr);
1789     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1790     limit = (v16u8) __msa_fill_b(limit_ptr);
1791 
1792     /* mask and hev */
1793     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1794                  hev, mask, flat);
1795     /* flat4 */
1796     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1797     /* filter4 */
1798     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1799                        q1_out);
1800 
1801     flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1802 
1803     /* if flat is zero for all pixels, then no need to calculate other filter */
1804     if (__msa_test_bz_v(flat)) {
1805         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1806         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1807         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
1808         return 1;
1809     } else {
1810         ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1811                    zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1812                    q3_r);
1813         VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1814                     p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1815 
1816         /* convert 16 bit output data into 8 bit */
1817         p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1818         p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1819         p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1820         q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1821         q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1822         q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1823 
1824         /* store pixel values */
1825         p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1826         p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1827         p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1828         q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1829         q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1830         q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1831 
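        /* stash the filter8 results and the flat mask in the filter48
         * scratch buffer for the 16-tap stage */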
1832         ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1833         filter48 += (4 * 16);
1834         ST_UB2(q1_out, q2_out, filter48, 16);
1835         filter48 += (2 * 16);
1836         ST_UB(flat, filter48);
1837 
1838         return 0;
1839     }
1840 }
1841 
1842 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1843                                  uint8_t *filter48)
1844 {
1845     v16i8 zero = { 0 };
1846     v16u8 filter8, flat, flat2;
1847     v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1848     v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1849     v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1850     v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1851     v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1852     v8u16 tmp0_r, tmp1_r;
1853     v8i16 r_out;
1854 
1855     flat = LD_UB(filter48 + 6 * 16);
1856 
1857     LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1858     LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1859 
1860     VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1861 
1862     /* if flat2 is zero for all pixels, then no need to calculate other filter */
1863     if (__msa_test_bz_v(flat2)) {
1864         v8i16 vec0, vec1, vec2, vec3, vec4;
1865 
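        /* flat2 is zero everywhere: write back the filter8 results saved in
         * filter48 */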
1866         LD_UB4(filter48, 16, p2, p1, p0, q0);
1867         LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1868 
1869         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1870         ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1871         vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1872 
1873         src_org -= 3;
1874         ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
1875         ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
1876         src_org += (4 * pitch);
1877         ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
1878         ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
1879 
1880         return 1;
1881     } else {
1882         src -= 7 * 16;
1883 
1884         ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1885                    zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1886                    p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1887         q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1888 
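        /* p6: seed the running sum (7 * p7 + 2 * p6 + p5 + .. + p0 + q0),
         * then round-shift by 4 */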
1889         tmp0_r = p7_r_in << 3;
1890         tmp0_r -= p7_r_in;
1891         tmp0_r += p6_r_in;
1892         tmp0_r += q0_r_in;
1893         tmp1_r = p6_r_in + p5_r_in;
1894         tmp1_r += p4_r_in;
1895         tmp1_r += p3_r_in;
1896         tmp1_r += p2_r_in;
1897         tmp1_r += p1_r_in;
1898         tmp1_r += p0_r_in;
1899         tmp1_r += tmp0_r;
1900 
1901         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1902         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1903         p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1904         ST_D1(p6, 0, src);
1905         src += 16;
1906 
1907         /* p5 */
1908         q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1909         tmp0_r = p5_r_in - p6_r_in;
1910         tmp0_r += q1_r_in;
1911         tmp0_r -= p7_r_in;
1912         tmp1_r += tmp0_r;
1913         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1914         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1915         p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1916         ST_D1(p5, 0, src);
1917         src += 16;
1918 
1919         /* p4 */
1920         q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1921         tmp0_r = p4_r_in - p5_r_in;
1922         tmp0_r += q2_r_in;
1923         tmp0_r -= p7_r_in;
1924         tmp1_r += tmp0_r;
1925         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1926         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1927         p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1928         ST_D1(p4, 0, src);
1929         src += 16;
1930 
1931         /* p3 */
1932         q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1933         tmp0_r = p3_r_in - p4_r_in;
1934         tmp0_r += q3_r_in;
1935         tmp0_r -= p7_r_in;
1936         tmp1_r += tmp0_r;
1937         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1938         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1939         p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1940         ST_D1(p3, 0, src);
1941         src += 16;
1942 
1943         /* p2 */
1944         q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1945         filter8 = LD_UB(filter48);
1946         tmp0_r = p2_r_in - p3_r_in;
1947         tmp0_r += q4_r_in;
1948         tmp0_r -= p7_r_in;
1949         tmp1_r += tmp0_r;
1950         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1951         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1952         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1953         ST_D1(filter8, 0, src);
1954         src += 16;
1955 
1956         /* p1 */
1957         q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1958         filter8 = LD_UB(filter48 + 16);
1959         tmp0_r = p1_r_in - p2_r_in;
1960         tmp0_r += q5_r_in;
1961         tmp0_r -= p7_r_in;
1962         tmp1_r += tmp0_r;
1963         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1964         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1965         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1966         ST_D1(filter8, 0, src);
1967         src += 16;
1968 
1969         /* p0 */
1970         q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1971         filter8 = LD_UB(filter48 + 32);
1972         tmp0_r = p0_r_in - p1_r_in;
1973         tmp0_r += q6_r_in;
1974         tmp0_r -= p7_r_in;
1975         tmp1_r += tmp0_r;
1976         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1977         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1978         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1979         ST_D1(filter8, 0, src);
1980         src += 16;
1981 
1982         /* q0 */
1983         q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1984         filter8 = LD_UB(filter48 + 48);
1985         tmp0_r = q7_r_in - p0_r_in;
1986         tmp0_r += q0_r_in;
1987         tmp0_r -= p7_r_in;
1988         tmp1_r += tmp0_r;
1989         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1990         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1991         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1992         ST_D1(filter8, 0, src);
1993         src += 16;
1994 
1995         /* q1 */
1996         filter8 = LD_UB(filter48 + 64);
1997         tmp0_r = q7_r_in - q0_r_in;
1998         tmp0_r += q1_r_in;
1999         tmp0_r -= p6_r_in;
2000         tmp1_r += tmp0_r;
2001         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2002         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2003         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2004         ST_D1(filter8, 0, src);
2005         src += 16;
2006 
2007         /* q2 */
2008         filter8 = LD_UB(filter48 + 80);
2009         tmp0_r = q7_r_in - q1_r_in;
2010         tmp0_r += q2_r_in;
2011         tmp0_r -= p5_r_in;
2012         tmp1_r += tmp0_r;
2013         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2014         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2015         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2016         ST_D1(filter8, 0, src);
2017         src += 16;
2018 
2019         /* q3 */
2020         tmp0_r = q7_r_in - q2_r_in;
2021         tmp0_r += q3_r_in;
2022         tmp0_r -= p4_r_in;
2023         tmp1_r += tmp0_r;
2024         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2025         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2026         q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2027         ST_D1(q3, 0, src);
2028         src += 16;
2029 
2030         /* q4 */
2031         tmp0_r = q7_r_in - q3_r_in;
2032         tmp0_r += q4_r_in;
2033         tmp0_r -= p3_r_in;
2034         tmp1_r += tmp0_r;
2035         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2036         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2037         q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2038         ST_D1(q4, 0, src);
2039         src += 16;
2040 
2041         /* q5 */
2042         tmp0_r = q7_r_in - q4_r_in;
2043         tmp0_r += q5_r_in;
2044         tmp0_r -= p2_r_in;
2045         tmp1_r += tmp0_r;
2046         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2047         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2048         q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2049         ST_D1(q5, 0, src);
2050         src += 16;
2051 
2052         /* q6 */
2053         tmp0_r = q7_r_in - q5_r_in;
2054         tmp0_r += q6_r_in;
2055         tmp0_r -= p1_r_in;
2056         tmp1_r += tmp0_r;
2057         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2058         r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2059         q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2060         ST_D1(q6, 0, src);
2061 
2062         return 0;
2063     }
2064 }
2065 
2066 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2067                                int32_t b_limit_ptr,
2068                                int32_t limit_ptr,
2069                                int32_t thresh_ptr)
2070 {
2071     uint8_t early_exit = 0;
2072     uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2073     uint8_t *filter48 = &transposed_input[16 * 16];
2074 
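    /* work on a transposed copy of the 16x8 neighbourhood (16-byte stride)
     * so the vertical edge can be filtered as rows */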
2075     vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2076 
2077     early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2078                                          &filter48[0], src, pitch,
2079                                          b_limit_ptr, limit_ptr, thresh_ptr);
2080 
2081     if (0 == early_exit) {
2082         early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2083                                        &filter48[0]);
2084 
2085         if (0 == early_exit) {
2086             vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2087         }
2088     }
2089 }
2090 
2091 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2092                                         uint8_t *src_org, ptrdiff_t pitch,
2093                                         int32_t b_limit_ptr,
2094                                         int32_t limit_ptr,
2095                                         int32_t thresh_ptr)
2096 {
2097     v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2098     v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2099     v16u8 flat, mask, hev, thresh, b_limit, limit;
2100     v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2101     v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2102     v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2103     v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2104     v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2105     v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2106     v16i8 zero = { 0 };
2107     v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2108 
2109     /* load vector elements */
2110     LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2111 
2112     thresh = (v16u8) __msa_fill_b(thresh_ptr);
2113     b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2114     limit = (v16u8) __msa_fill_b(limit_ptr);
2115 
2116     /* mask and hev */
2117     LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2118                  hev, mask, flat);
2119     /* flat4 */
2120     VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2121     /* filter4 */
2122     VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2123                        q1_out);
2124 
2125     /* if flat is zero for all pixels, then no need to calculate other filter */
2126     if (__msa_test_bz_v(flat)) {
2127         ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2128         ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2129         ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2130         ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2131 
2132         src_org -= 2;
2133         ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
2134         ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
2135 
2136         return 1;
2137     } else {
2138         ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2139                    zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2140                    q3_r);
2141         VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2142                     p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2143         ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2144                    p0_l);
2145         ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2146                    q3_l);
2147         VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2148                     p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2149 
2150         /* convert 16 bit output data into 8 bit */
2151         PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2152                     p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2153                     p0_filt8_r, q0_filt8_r);
2154         PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2155                     q2_filt8_r);
2156 
2157         /* store pixel values */
2158         p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2159         p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2160         p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2161         q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2162         q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2163         q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2164 
2165         ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2166         filter48 += (4 * 16);
2167         ST_UB2(q1_out, q2_out, filter48, 16);
2168         filter48 += (2 * 16);
2169         ST_UB(flat, filter48);
2170 
2171         return 0;
2172     }
2173 }
2174 
2175 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2176                                   uint8_t *filter48)
2177 {
2178     v16u8 flat, flat2, filter8;
2179     v16i8 zero = { 0 };
2180     v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2181     v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2182     v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2183     v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2184     v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2185     v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2186     v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2187     v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2188     v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2189     v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2190     v8i16 l_out, r_out;
2191 
2192     flat = LD_UB(filter48 + 6 * 16);
2193 
2194     LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2195     LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2196 
2197     VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2198 
2199     /* if flat2 is zero for all pixels, then no need to calculate other filter */
2200     if (__msa_test_bz_v(flat2)) {
2201         v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2202 
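        /* flat2 is zero everywhere: write back the filter8 results saved in
         * filter48 */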
2203         LD_UB4(filter48, 16, p2, p1, p0, q0);
2204         LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2205 
2206         ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2207         ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2208         ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2209         ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2210         ILVRL_B2_SH(q2, q1, vec2, vec5);
2211 
2212         src_org -= 3;
2213         ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
2214         ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
2215         src_org += (4 * pitch);
2216         ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
2217         ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
2218         src_org += (4 * pitch);
2219         ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
2220         ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
2221         src_org += (4 * pitch);
2222         ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
2223         ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
2224 
2225         return 1;
2226     } else {
2227         src -= 7 * 16;
2228 
2229         ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2230                    zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2231                    p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2232         q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2233 
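        /* p6: seed the running sums for the right and left halves of the
         * wide filter */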
2234         tmp0_r = p7_r_in << 3;
2235         tmp0_r -= p7_r_in;
2236         tmp0_r += p6_r_in;
2237         tmp0_r += q0_r_in;
2238         tmp1_r = p6_r_in + p5_r_in;
2239         tmp1_r += p4_r_in;
2240         tmp1_r += p3_r_in;
2241         tmp1_r += p2_r_in;
2242         tmp1_r += p1_r_in;
2243         tmp1_r += p0_r_in;
2244         tmp1_r += tmp0_r;
2245         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2246 
2247         ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2248                    p5_l_in, p4_l_in);
2249         ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2250                    p1_l_in, p0_l_in);
2251         q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2252 
2253         tmp0_l = p7_l_in << 3;
2254         tmp0_l -= p7_l_in;
2255         tmp0_l += p6_l_in;
2256         tmp0_l += q0_l_in;
2257         tmp1_l = p6_l_in + p5_l_in;
2258         tmp1_l += p4_l_in;
2259         tmp1_l += p3_l_in;
2260         tmp1_l += p2_l_in;
2261         tmp1_l += p1_l_in;
2262         tmp1_l += p0_l_in;
2263         tmp1_l += tmp0_l;
2264         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2265 
2266         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2267         p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2268         ST_UB(p6, src);
2269         src += 16;
2270 
2271         /* p5 */
2272         q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2273         tmp0_r = p5_r_in - p6_r_in;
2274         tmp0_r += q1_r_in;
2275         tmp0_r -= p7_r_in;
2276         tmp1_r += tmp0_r;
2277         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2278         q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2279         tmp0_l = p5_l_in - p6_l_in;
2280         tmp0_l += q1_l_in;
2281         tmp0_l -= p7_l_in;
2282         tmp1_l += tmp0_l;
2283         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2284         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2285         p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2286         ST_UB(p5, src);
2287         src += 16;
2288 
2289         /* p4 */
2290         q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2291         tmp0_r = p4_r_in - p5_r_in;
2292         tmp0_r += q2_r_in;
2293         tmp0_r -= p7_r_in;
2294         tmp1_r += tmp0_r;
2295         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2296         q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2297         tmp0_l = p4_l_in - p5_l_in;
2298         tmp0_l += q2_l_in;
2299         tmp0_l -= p7_l_in;
2300         tmp1_l += tmp0_l;
2301         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2302         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2303         p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2304         ST_UB(p4, src);
2305         src += 16;
2306 
2307         /* p3 */
2308         q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2309         tmp0_r = p3_r_in - p4_r_in;
2310         tmp0_r += q3_r_in;
2311         tmp0_r -= p7_r_in;
2312         tmp1_r += tmp0_r;
2313         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2314         q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2315         tmp0_l = p3_l_in - p4_l_in;
2316         tmp0_l += q3_l_in;
2317         tmp0_l -= p7_l_in;
2318         tmp1_l += tmp0_l;
2319         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2320         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2321         p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2322         ST_UB(p3, src);
2323         src += 16;
2324 
2325         /* p2 */
2326         q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2327         filter8 = LD_UB(filter48);
2328         tmp0_r = p2_r_in - p3_r_in;
2329         tmp0_r += q4_r_in;
2330         tmp0_r -= p7_r_in;
2331         tmp1_r += tmp0_r;
2332         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2333         q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2334         tmp0_l = p2_l_in - p3_l_in;
2335         tmp0_l += q4_l_in;
2336         tmp0_l -= p7_l_in;
2337         tmp1_l += tmp0_l;
2338         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2340         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2341         ST_UB(filter8, src);
2342         src += 16;
2343 
2344         /* p1 */
2345         q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2346         filter8 = LD_UB(filter48 + 16);
2347         tmp0_r = p1_r_in - p2_r_in;
2348         tmp0_r += q5_r_in;
2349         tmp0_r -= p7_r_in;
2350         tmp1_r += tmp0_r;
2351         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352         q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2353         tmp0_l = p1_l_in - p2_l_in;
2354         tmp0_l += q5_l_in;
2355         tmp0_l -= p7_l_in;
2356         tmp1_l += tmp0_l;
2357         l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2358         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2360         ST_UB(filter8, src);
2361         src += 16;
2362 
2363         /* p0 */
2364         q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2365         filter8 = LD_UB(filter48 + 32);
2366         tmp0_r = p0_r_in - p1_r_in;
2367         tmp0_r += q6_r_in;
2368         tmp0_r -= p7_r_in;
2369         tmp1_r += tmp0_r;
2370         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2371         q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2372         tmp0_l = p0_l_in - p1_l_in;
2373         tmp0_l += q6_l_in;
2374         tmp0_l -= p7_l_in;
2375         tmp1_l += tmp0_l;
2376         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2377         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2378         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2379         ST_UB(filter8, src);
2380         src += 16;
2381 
2382         /* q0 */
2383         q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2384         filter8 = LD_UB(filter48 + 48);
2385         tmp0_r = q7_r_in - p0_r_in;
2386         tmp0_r += q0_r_in;
2387         tmp0_r -= p7_r_in;
2388         tmp1_r += tmp0_r;
2389         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2390         q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2391         tmp0_l = q7_l_in - p0_l_in;
2392         tmp0_l += q0_l_in;
2393         tmp0_l -= p7_l_in;
2394         tmp1_l += tmp0_l;
2395         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2396         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2397         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2398         ST_UB(filter8, src);
2399         src += 16;
2400 
2401         /* q1 */
2402         filter8 = LD_UB(filter48 + 64);
2403         tmp0_r = q7_r_in - q0_r_in;
2404         tmp0_r += q1_r_in;
2405         tmp0_r -= p6_r_in;
2406         tmp1_r += tmp0_r;
2407         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2408         tmp0_l = q7_l_in - q0_l_in;
2409         tmp0_l += q1_l_in;
2410         tmp0_l -= p6_l_in;
2411         tmp1_l += tmp0_l;
2412         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415         ST_UB(filter8, src);
2416         src += 16;
2417 
2418         /* q2 */
2419         filter8 = LD_UB(filter48 + 80);
2420         tmp0_r = q7_r_in - q1_r_in;
2421         tmp0_r += q2_r_in;
2422         tmp0_r -= p5_r_in;
2423         tmp1_r += tmp0_r;
2424         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2425         tmp0_l = q7_l_in - q1_l_in;
2426         tmp0_l += q2_l_in;
2427         tmp0_l -= p5_l_in;
2428         tmp1_l += tmp0_l;
2429         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2430         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2431         filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2432         ST_UB(filter8, src);
2433         src += 16;
2434 
2435         /* q3 */
2436         tmp0_r = q7_r_in - q2_r_in;
2437         tmp0_r += q3_r_in;
2438         tmp0_r -= p4_r_in;
2439         tmp1_r += tmp0_r;
2440         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2441         tmp0_l = q7_l_in - q2_l_in;
2442         tmp0_l += q3_l_in;
2443         tmp0_l -= p4_l_in;
2444         tmp1_l += tmp0_l;
2445         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2446         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2447         q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2448         ST_UB(q3, src);
2449         src += 16;
2450 
2451         /* q4 */
2452         tmp0_r = q7_r_in - q3_r_in;
2453         tmp0_r += q4_r_in;
2454         tmp0_r -= p3_r_in;
2455         tmp1_r += tmp0_r;
2456         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2457         tmp0_l = q7_l_in - q3_l_in;
2458         tmp0_l += q4_l_in;
2459         tmp0_l -= p3_l_in;
2460         tmp1_l += tmp0_l;
2461         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2462         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2463         q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2464         ST_UB(q4, src);
2465         src += 16;
2466 
2467         /* q5 */
2468         tmp0_r = q7_r_in - q4_r_in;
2469         tmp0_r += q5_r_in;
2470         tmp0_r -= p2_r_in;
2471         tmp1_r += tmp0_r;
2472         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2473         tmp0_l = q7_l_in - q4_l_in;
2474         tmp0_l += q5_l_in;
2475         tmp0_l -= p2_l_in;
2476         tmp1_l += tmp0_l;
2477         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2478         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2479         q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2480         ST_UB(q5, src);
2481         src += 16;
2482 
2483         /* q6 */
2484         tmp0_r = q7_r_in - q5_r_in;
2485         tmp0_r += q6_r_in;
2486         tmp0_r -= p1_r_in;
2487         tmp1_r += tmp0_r;
2488         r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2489         tmp0_l = q7_l_in - q5_l_in;
2490         tmp0_l += q6_l_in;
2491         tmp0_l -= p1_l_in;
2492         tmp1_l += tmp0_l;
2493         l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2494         r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2495         q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2496         ST_UB(q6, src);
2497 
2498         return 0;
2499     }
2500 }
2501 
2502 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2503                                 int32_t b_limit_ptr,
2504                                 int32_t limit_ptr,
2505                                 int32_t thresh_ptr)
2506 {
2507     uint8_t early_exit = 0;
2508     uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2509     uint8_t *filter48 = &transposed_input[16 * 16];
2510 
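    /* work on a transposed copy of the 16x16 neighbourhood so the vertical
     * edge can be filtered as rows */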
2511     vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2512 
2513     early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2514                                           &filter48[0], src, pitch,
2515                                           b_limit_ptr, limit_ptr, thresh_ptr);
2516 
2517     if (0 == early_exit) {
2518         early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2519                                         &filter48[0]);
2520 
2521         if (0 == early_exit) {
2522             vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2523         }
2524     }
2525 }
2526