/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_ports/mem.h"
#include "aom_dsp/mips/loopfilter_msa.h"

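/* First stage of the 16-column horizontal wide filter: applies the 4-tap
 * filter everywhere and computes the 8-tap (flat) results. Returns 1 (early
 * exit) when no column needs the 8-tap path; otherwise the filter8 results
 * and the flat mask are cached in the 16-byte-strided filter48 scratch
 * buffer and 0 is returned so the caller can run the wide second stage. */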
int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

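/* Second stage of the 16-column horizontal wide filter: computes the wide
 * flat2 mask from p7..q7 using the flat mask cached at filter48 + 96. If
 * flat2 is zero everywhere, the filter8/filter4 results cached in filter48
 * are written out. Otherwise each output row p6..q6 is produced with the
 * 15-tap filter, falling back per pixel to the cached results where flat2
 * is not set. */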
void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

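/* Filters a 16-column horizontal edge in two stages, skipping the wide
 * second stage when the t4/t8 stage reports an early exit. */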
static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch,
                                        const uint8_t *b_limit_ptr,
                                        const uint8_t *limit_ptr,
                                        const uint8_t *thresh_ptr,
                                        int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    aom_hz_lpf_t16_16w(src, pitch, filter48);
  }
}

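/* Horizontal wide-filter dispatcher: count == 1 processes a single 8-column
 * edge in-line with 64-bit stores; any other count takes the 16-column dual
 * path above. */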
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load 16 vector elements */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* LSB(right) 8 pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    mb_lpf_horizontal_edge_dual(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr,
                                count);
  }
}

void aom_lpf_horizontal_16_msa(uint8_t *src, int32_t pitch,
                               const uint8_t *b_limit_ptr,
                               const uint8_t *limit_ptr,
                               const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

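/* Transposes an 8-row x 16-column pixel block into a 16-row x 8-column
 * layout in the output buffer, so a vertical edge can be filtered with the
 * horizontal filter code. */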
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

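/* Inverse of transpose_16x8_to_8x16: reads 16 rows of 8 filtered pixels
 * from the scratch buffer and stores them back as an 8-row x 16-column
 * block in the image. */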
static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

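/* Full in-register 16x16 byte transpose; used by the dual vertical filter
 * both to gather 16 columns into rows and to scatter the filtered rows
 * back. */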
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose 16x8 matrix into 8x16 */
  /* total 8 intermediate register and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

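/* Vertical-edge t4/t8 stage for 8 rows: src points into the pre-transposed
 * scratch buffer (pitch 16) while src_org/pitch_org address the original
 * image. On early exit the filter4 results are transposed and stored
 * straight back to the image; otherwise the filter8 results and the flat
 * mask are cached in filter48. */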
int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

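/* Wide (15-tap) stage for the 8-row vertical edge. If flat2 is zero
 * everywhere, the cached filter8/filter4 results are transposed back to the
 * image and 1 is returned; otherwise the wide filter runs in the transposed
 * buffer and 0 is returned so the caller writes the buffer back. */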
int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

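/* Public 8-row vertical wide filter: transposes the 16 columns around the
 * edge into a scratch buffer, filters them as horizontal rows, and
 * transposes the result back when neither stage takes its early exit. */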
void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit =
      aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                   &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

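/* 16-row version of the vertical t4/t8 stage: same structure as the 8-row
 * variant, but it filters both halves of each vector and transposes and
 * stores all 16 rows of filter4 output on early exit. */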
int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

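/* 16-row version of the vertical wide stage: on early exit the cached
 * narrower results are transposed back to all 16 rows of the image;
 * otherwise the 15-tap filter is applied to both halves of each vector in
 * the transposed buffer. */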
int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

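/* Public 16-row vertical wide filter; same scheme as aom_lpf_vertical_16_msa
 * but built on full 16x16 transposes. */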
void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit =
      aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}