1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_dsp_rtcd.h"
12 #include "vpx_dsp/mips/loopfilter_msa.h"
13 #include "vpx_ports/mem.h"
14 
/* Stage 1 of the 16-column-wide horizontal super-block loop filter:
 * runs the 4-tap filter on all columns the mask allows, and where the
 * flat4 test passes also computes the 8-tap filter output.
 *
 * Returns 1 (early exit) when no column is flat: the 4-tap results are
 * already written back to src and stage 2 can be skipped entirely.
 * Returns 0 otherwise, after spilling the blended 8-tap results and the
 * flat mask into the filter48 scratch buffer laid out as seven 16-byte
 * vectors: p2, p1, p0, q0, q1, q2 at byte offsets 0..80 and the flat
 * mask at offset 96 (consumed by hz_lpf_t16_16w).
 */
static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch,
                                    uint8_t *filter48,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements: 4 rows above and 4 rows below the edge */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  /* broadcast the scalar thresholds across all 16 lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat column anywhere: the narrow filter result is final */
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    /* widen bytes to 16-bit halfwords (right half of each vector) and
     * run the 8-tap filter on it */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* same for the left half */
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* per-lane select: 8-tap result where flat, otherwise the 4-tap
     * (or original, for p2/q2) value */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* spill results + flat mask for stage 2 (see layout in header) */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
81 
/* Stage 2 of the 16-column-wide horizontal super-block loop filter: the
 * wide (16-tap, "flat2") filter.  Consumes the filter48 scratch written
 * by hz_lpf_t4_and_t8_16w (six filtered vectors at offsets 0..80, flat
 * mask at 96).
 *
 * If no column passes the flat2 test, the stage-1 results are simply
 * copied back out to src.  Otherwise each of the 14 output rows
 * p6..q6 is computed as a rounded 15-tap running average: tmp1_{r,l}
 * holds a sliding sum that is updated per row by adding the incoming
 * sample and subtracting the outgoing one (tmp0_{r,l} is the per-row
 * delta), then >> 4 with rounding via __msa_srari_h.  The _r/_l halves
 * process the low/high 8 columns in 16-bit precision and are re-packed
 * to bytes before the flat2-masked store. */
static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask saved by stage 1 */
  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* no column needs the wide filter: flush the stage-1 results */
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch; /* first output row is p6 */

    /* widen p7..p0 (right halves) to 16 bit */
    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* seed sum for p6: 7*p7 + p6 + q0 ... (tmp0 carries 7*p7+p6+q0) */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    /* same seeding for the left halves */
    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* p6: pack halves, select wide result only where flat2 */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = (v8i16)__msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2: base value comes from the stage-1 filter8 output */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3: from here on the base value is the original pixel again */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6: last output row */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}
407 
/* Drives the 16-column-wide horizontal super-block edge filter as two
 * stages: the 4/8-tap stage first, then the wide 16-tap stage only when
 * at least one column passed the flat test.  The 128-byte filter48
 * scratch buffer carries the stage-1 results (and flat mask) across to
 * stage 2.  count is unused in this width. */
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);

  (void)count;

  /* stage 1 returns nonzero on early exit (no flat columns) */
  if (!hz_lpf_t4_and_t8_16w(src, pitch, filter48, b_limit_ptr, limit_ptr,
                            thresh_ptr)) {
    hz_lpf_t16_16w(src, pitch, filter48);
  }
}
425 
/* Horizontal super-block edge filter.  For count == 1 it processes a
 * single 8-column edge entirely in-line (loading 16 pixels per vector
 * but only using the low 8 columns, hence the 64-bit copy/store idiom);
 * for any other count it defers to the 16-column dual path.
 *
 * The single-column path mirrors the two-stage structure: narrow 4-tap
 * filter everywhere, 8-tap where flat, and a 15-tap sliding-window wide
 * filter where flat2 also passes.  tmp1 holds the running window sum;
 * each p/q pair below updates it twice (add-new/subtract-old deltas in
 * tmp0 and tmp2) and rounds with >> 4 via __msa_srari_h. */
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    /* broadcast scalar thresholds to all lanes */
    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    /* only the low 8 columns are valid: clear the high half of flat */
    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      /* nothing flat: store the 4-tap result (8 bytes per row) */
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* blend 8-tap result into the flat lanes */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements (outer rows for the wide filter) */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        /* no very-flat columns: store the 8-tap blend and stop */
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        /* seed: 7*p7 + p6 + q0 */
        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    /* 16-column edges use the dual (two 8-column halves) path */
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}
648 
/* Public entry point: 16-tap horizontal loop filter over a single
 * 8-column edge (count == 1). */
void vpx_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                         /*count=*/1);
}
655 
/* Public entry point: 16-tap horizontal loop filter over two adjacent
 * 8-column edges at once (count == 2). */
void vpx_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                         /*count=*/2);
}
662 
/* Transposes a 16-column x 8-row byte tile (read from input with stride
 * in_pitch) into an 8-column x 16-row tile (written to output with
 * stride out_pitch).  Done as two 8x8 sub-transposes: the low bytes via
 * TRANSPOSE8x8_UB_UB into p7..p0, the high bytes via an explicit
 * interleave network into q0..q7. */
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose (low 8 bytes of each row) */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose (high 8 bytes of each row) */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  /* odd output rows are the upper 8 bytes of the even ones */
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
687 
/* Transposes an 8-column x 16-row byte tile (read from input with
 * stride in_pitch) into a 16-column x 8-row tile (written to output
 * with stride out_pitch).  The heavy lifting is done by the
 * TRANSPOSE16x8_UB_UB macro. */
static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
  v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
  v16u8 in8, in9, in10, in11, in12, in13, in14, in15;

  /* load all 16 source rows */
  LD_UB8(input, in_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
  LD_UB8(input + (8 * in_pitch), in_pitch, in8, in9, in10, in11, in12, in13,
         in14, in15);
  /* 16x8 -> 8x16 transpose into eight output vectors */
  TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
                      in11, in12, in13, in14, in15, out0, out1, out2, out3,
                      out4, out5, out6, out7);
  ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, output, out_pitch);
}
699 
/* Full 16x16 byte transpose from input (stride in_pitch) to output
 * (stride out_pitch).  The top 16x8 half goes through the
 * TRANSPOSE16x8_UB_UB macro into p7..p0; the bottom half (q0..q7) is
 * built with an explicit odd/even interleave network over the same row
 * registers. */
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  /* gather the high 8 bytes of each row pair */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  /* byte-level even/odd interleave */
  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  /* halfword- and word-level interleave to finish the transpose */
  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}
757 
/* Vertical loop filter, 8-pixel edge: runs the 4-tap filter and, where the
 * "flat" mask demands it, the 8-tap filter on a block that has already been
 * transposed into a 16-byte-pitch scratch buffer.
 *
 * src        - points at pixel row q0 inside the transposed scratch buffer
 *              (row pitch is the literal 16 used in the loads below).
 * filter48   - scratch area that receives the six 8-tap outputs plus the
 *              flat mask (7 x 16 bytes) for the follow-up 16-tap stage.
 * src_org    - pointer into the original (untransposed) frame; only used to
 *              store the 4-tap result directly on the early-exit path.
 * pitch_org  - frame pitch matching src_org.
 * b_limit_ptr / limit_ptr / thresh_ptr - scalar thresholds, splatted across
 *              all vector lanes.
 *
 * Returns 1 when no pixel needs the 8-tap filter (4-tap results already
 * written through src_org), otherwise 0 with intermediates left in filter48.
 */
static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                   uint8_t *src_org, int32_t pitch_org,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  /* splat the scalar thresholds across all 16 byte lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  /* only the low 8 lanes matter for an 8-pixel edge; clear the upper half
   * so the bz test below sees just the relevant mask bits */
  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    /* no flat pixels: transpose the four 4-tap outputs back to vertical
     * orientation and store them straight into the frame */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    /* widen the right (low) halves to 16 bit and run the 8-tap filter */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values: per-lane select between the 4-tap and 8-tap
     * results according to the flat mask */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    /* hand results and the flat mask to the t16 stage via filter48 */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
824 
/* Vertical loop filter, 8-pixel edge, 16-tap ("flat2") stage.
 *
 * Consumes the intermediate data produced by vt_lpf_t4_and_t8_8w:
 *   src      - points at row q0 inside the 16-byte-pitch transposed buffer.
 *   src_org  - original frame pointer, used only on the early-exit path to
 *              write the t4/t8 results back in vertical orientation.
 *   pitch    - frame pitch matching src_org.
 *   filter48 - the six t8 outputs (offsets 0..5*16) and flat mask (6*16).
 *
 * Returns 1 when no pixel needs the 15-tap filter (results already stored
 * through src_org), otherwise 0 with filtered rows written back into the
 * transposed buffer for the caller to transpose out.
 */
static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                             uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  /* flat mask stored by the t4/t8 stage */
  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* no wide-flat pixels: transpose the stored t4/t8 results (6 columns)
     * back into the frame and finish */
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    /* 15-tap filter: keep a sliding window sum in tmp1_r and update it with
     * the signed delta tmp0_r per output row; each output is the rounded
     * average (sum + 8) >> 4 via __msa_srari_h(.., 4). */
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6: initial window = 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    /* keep the original pixel wherever flat2 is clear */
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2: from here through q2 the non-wide-flat fallback is the stored
     * t8 result (filter8), not the unfiltered pixel */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}
1044 
vpx_lpf_vertical_16_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1045 void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
1046                              const uint8_t *b_limit_ptr,
1047                              const uint8_t *limit_ptr,
1048                              const uint8_t *thresh_ptr) {
1049   uint8_t early_exit = 0;
1050   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1051   uint8_t *filter48 = &transposed_input[16 * 16];
1052 
1053   transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
1054 
1055   early_exit =
1056       vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch,
1057                           b_limit_ptr, limit_ptr, thresh_ptr);
1058 
1059   if (0 == early_exit) {
1060     early_exit =
1061         vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
1062 
1063     if (0 == early_exit) {
1064       transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
1065     }
1066   }
1067 }
1068 
/* Vertical loop filter, 16-pixel (dual) edge: 4-tap plus conditional 8-tap
 * stage, operating on a full 16-lane transposed block.  Unlike the 8w
 * variant, both the right (low) and left (high) byte halves are widened and
 * filtered.
 *
 * src        - points at row q0 inside the 16-byte-pitch transposed buffer.
 * filter48   - scratch area receiving the six 8-tap outputs plus the flat
 *              mask (7 x 16 bytes) for the follow-up 16-tap stage.
 * src_org    - original frame pointer, used only to store the 4-tap result
 *              directly on the early-exit path.
 * pitch      - frame pitch matching src_org.
 * b_limit_ptr / limit_ptr / thresh_ptr - scalar thresholds, splatted across
 *              all vector lanes.
 *
 * Returns 1 when no pixel needs the 8-tap filter (results already written
 * through src_org), otherwise 0 with intermediates left in filter48.
 */
static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                    uint8_t *src_org, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  /* splat the scalar thresholds across all 16 byte lanes */
  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    /* no flat pixels: transpose both halves of the 4-tap outputs back to
     * vertical orientation and store directly into the frame */
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    /* widen both halves to 16 bit and run the 8-tap filter on each */
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values: per-lane select between the 4-tap and 8-tap
     * results according to the flat mask */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    /* hand results and the flat mask to the t16 stage via filter48 */
    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}
1145 
/* Vertical loop filter, 16-pixel (dual) edge, 16-tap ("flat2") stage.
 * Same sliding-window 15-tap filter as vt_lpf_t16_8w, but both the right
 * (low) and left (high) byte halves are widened and filtered, then packed
 * back into one 16-byte store per row.
 *
 * src      - points at row q0 inside the 16-byte-pitch transposed buffer.
 * src_org  - original frame pointer, used only on the early-exit path.
 * pitch    - frame pitch matching src_org.
 * filter48 - the six t8 outputs (offsets 0..5*16) and flat mask (6*16)
 *            stored by vt_lpf_t4_and_t8_16w.
 *
 * Returns 1 when no pixel needs the 15-tap filter (results already stored
 * through src_org), otherwise 0 with filtered rows written back into the
 * transposed buffer for the caller to transpose out.
 */
static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                              uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  /* flat mask stored by the t4/t8 stage */
  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    /* no wide-flat pixels: transpose the stored t4/t8 results (6 columns,
     * all 16 rows) back into the frame and finish */
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    /* 15-tap filter: maintain sliding window sums tmp1_r/tmp1_l (right and
     * left halves), updated per row by signed deltas tmp0_r/tmp0_l; each
     * output is the rounded average (sum + 8) >> 4 via __msa_srari_h(.., 4). */
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    /* p6, right half: initial window =
     * 7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 */
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    /* p6, left half: same window on the high 8 lanes */
    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    /* pack both halves back to bytes and keep originals where flat2 clear */
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2: from here through q2 the non-wide-flat fallback is the stored
     * t8 result (filter8), not the unfiltered pixel */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)(tmp1_l), 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}
1466 
vpx_lpf_vertical_16_dual_msa(uint8_t * src,int32_t pitch,const uint8_t * b_limit_ptr,const uint8_t * limit_ptr,const uint8_t * thresh_ptr)1467 void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
1468                                   const uint8_t *b_limit_ptr,
1469                                   const uint8_t *limit_ptr,
1470                                   const uint8_t *thresh_ptr) {
1471   uint8_t early_exit = 0;
1472   DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
1473   uint8_t *filter48 = &transposed_input[16 * 16];
1474 
1475   transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
1476 
1477   early_exit =
1478       vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
1479                            pitch, b_limit_ptr, limit_ptr, thresh_ptr);
1480 
1481   if (0 == early_exit) {
1482     early_exit =
1483         vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
1484 
1485     if (0 == early_exit) {
1486       transpose_16x16(transposed_input, 16, (src - 8), pitch);
1487     }
1488   }
1489 }
1490