/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_ports/mem.h"
#include "aom_dsp/mips/loopfilter_msa.h"

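/* First stage of the two-stage 16-wide horizontal loop filter: runs the
 * 4-tap and 8-tap filters across 16 columns. Returns 1 (early exit) when the
 * flat mask is all zero and only the filter4 results were stored to the
 * frame; otherwise it stashes the six filter8 output rows at
 * filter48 + 0..80 and the flat mask at filter48 + 96, and returns 0 so the
 * caller can run the 16-tap stage. */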
int32_t aom_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16u8 zero = { 0 };

  /* load vector elements */
  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

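/* Second stage of the two-stage 16-wide horizontal loop filter: applies the
 * wide filter to rows p6..q6 wherever flat2 is set, falling back to the
 * filter8 results stashed in filter48 elsewhere. Per output the filter is a
 * rounded 16-weight average; a scalar sketch of the first output, p6:
 *
 *   p6_out = (p7 * 7 + p6 * 2 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
 *
 * The vector code below builds this as a running sum and then slides the
 * window by one sample per output row. */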
void aom_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 96);

  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    src -= 3 * pitch;
    ST_UB4(p2, p1, p0, q0, src, pitch);
    src += (4 * pitch);
    ST_UB2(q1, q2, src, pitch);
  } else {
    src -= 7 * pitch;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);

    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

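    /* initial running sum for p6: tmp0_r = 7 * p7 + p6 + q0 and
       tmp1_r = p6 + p5 + p4 + p3 + p2 + p1 + p0 (right halves) */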
    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += pitch;

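    /* Each remaining output updates the running sum in place: add the sample
       entering the window plus a second copy of the new center pixel, and
       subtract the old center's extra copy plus the sample whose weight
       steps down at the trailing edge (the edges saturate at p7 and q7). */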
    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += pitch;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += pitch;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += pitch;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += pitch;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += pitch;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += pitch;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += pitch;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);
  }
}

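/* Filters a 16-pixel-wide horizontal edge: the 4/8-tap stage runs first and
 * the 16-tap stage is entered only when it reports (early_exit == 0) that at
 * least one column needs wide filtering. */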
void aom_lpf_horizontal_16_dual_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr, int32_t count) {
  DECLARE_ALIGNED(32, uint8_t, filter48[16 * 8]);
  uint8_t early_exit = 0;

  (void)count;

  early_exit = aom_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr,
                                        limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    aom_hz_lpf_t16_16w(src, pitch, filter48);
  }
}

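/* For count == 1 an 8-pixel-wide edge is handled inline with 64-bit stores;
 * any other count is forwarded to the 16-pixel-wide dual implementation. */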
static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr, int32_t count) {
  if (1 == count) {
    uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
    uint64_t dword0, dword1;
    v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
    v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
    v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
    v16u8 p0_filter16, p1_filter16;
    v8i16 p2_filter8, p1_filter8, p0_filter8;
    v8i16 q0_filter8, q1_filter8, q2_filter8;
    v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
    v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
    v16i8 zero = { 0 };
    v8u16 tmp0, tmp1, tmp2;

    /* load vector elements */
    LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);

    thresh = (v16u8)__msa_fill_b(*thresh_ptr);
    b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
    limit = (v16u8)__msa_fill_b(*limit_ptr);

    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
                 mask, flat);
    AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
    AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
                       q1_out);

    flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

    if (__msa_test_bz_v(flat)) {
      p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
      p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
      q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
      q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
      SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
    } else {
      /* convert 8 bit input data into 16 bit */
      ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
                 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
                 q3_r);
      AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);

      /* convert 16 bit output data into 8 bit */
      PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8, zero,
                  q0_filter8, p2_filter8, p1_filter8, p0_filter8, q0_filter8);
      PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);

      /* store pixel values */
      p2_out = __msa_bmnz_v(p2, (v16u8)p2_filter8, flat);
      p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filter8, flat);
      p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filter8, flat);
      q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filter8, flat);
      q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filter8, flat);
      q2_out = __msa_bmnz_v(q2, (v16u8)q2_filter8, flat);

      /* load the remaining pixel rows: p7-p4 and q4-q7 */
      LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
      LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);

      AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

      if (__msa_test_bz_v(flat2)) {
        p2_d = __msa_copy_u_d((v2i64)p2_out, 0);
        p1_d = __msa_copy_u_d((v2i64)p1_out, 0);
        p0_d = __msa_copy_u_d((v2i64)p0_out, 0);
        q0_d = __msa_copy_u_d((v2i64)q0_out, 0);
        q1_d = __msa_copy_u_d((v2i64)q1_out, 0);
        q2_d = __msa_copy_u_d((v2i64)q2_out, 0);

        SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
        SD(q1_d, src + pitch);
        SD(q2_d, src + 2 * pitch);
      } else {
        /* right-half (LSB) 8-pixel operation */
        ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4, zero, q5,
                   zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r, q4_r, q5_r, q6_r,
                   q7_r);

        tmp0 = p7_r << 3;
        tmp0 -= p7_r;
        tmp0 += p6_r;
        tmp0 += q0_r;

        src -= 7 * pitch;

        /* calculation of p6 and p5 */
        tmp1 = p6_r + p5_r + p4_r + p3_r;
        tmp1 += (p2_r + p1_r + p0_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp0 = p5_r - p6_r + q1_r - p7_r;
        tmp1 += tmp0;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p4 and p3 */
        tmp0 = p4_r - p5_r + q2_r - p7_r;
        tmp2 = p3_r - p4_r + q3_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p2 and p1 */
        tmp0 = p2_r - p3_r + q4_r - p7_r;
        tmp2 = p1_r - p2_r + q5_r - p7_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of p0 and q0 */
        tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
        tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q1 and q2 */
        tmp0 = q7_r - q0_r + q1_r - p6_r;
        tmp2 = q7_r - q1_r + q2_r - p5_r;
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q3 and q4 */
        tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
        tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
        src += pitch;

        /* calculation of q5 and q6 */
        tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
        tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
        tmp1 += tmp0;
        p0_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        tmp1 += tmp2;
        p1_filter16 = (v16u8)__msa_srari_h((v8i16)tmp1, 4);
        PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16, p0_filter16,
                    p1_filter16);
        p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
        p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
        dword0 = __msa_copy_u_d((v2i64)p0_filter16, 0);
        dword1 = __msa_copy_u_d((v2i64)p1_filter16, 0);
        SD(dword0, src);
        src += pitch;
        SD(dword1, src);
      }
    }
  } else {
    aom_lpf_horizontal_16_dual_msa(src, pitch, b_limit_ptr, limit_ptr,
                                   thresh_ptr, count);
  }
}

void aom_lpf_horizontal_edge_8_msa(uint8_t *src, int32_t pitch,
                                   const uint8_t *b_limit_ptr,
                                   const uint8_t *limit_ptr,
                                   const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 1);
}

void aom_lpf_horizontal_edge_16_msa(uint8_t *src, int32_t pitch,
                                    const uint8_t *b_limit_ptr,
                                    const uint8_t *limit_ptr,
                                    const uint8_t *thresh_ptr) {
  mb_lpf_horizontal_edge(src, pitch, b_limit_ptr, limit_ptr, thresh_ptr, 2);
}

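/* Vertical edges are filtered by transposing the affected pixels into a
 * contiguous scratch buffer with a pitch of 16, reusing the horizontal
 * filter logic on it, and transposing the result back to the frame. */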
static void transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7_org, p6_org, p5_org, p4_org, p3_org, p2_org,
         p1_org, p0_org);
  /* 8x8 transpose */
  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
                     p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
  /* 8x8 transpose of the second half */
  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
             tmp0, tmp1, tmp2, tmp3);
  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

static void transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
                                   uint8_t *output, int32_t out_pitch) {
  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;

  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
                      q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
}

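/* In-register 16x16 byte transpose: the top half of the result comes from
 * TRANSPOSE16x8_UB_UB, the bottom half from the interleave sequence below. */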
static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output,
                            int32_t out_pitch) {
  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
  v4i32 tmp2, tmp3;

  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
  input += (8 * in_pitch);
  LD_UB8(input, in_pitch, row8, row9, row10, row11, row12, row13, row14, row15);

  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7, row8,
                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
                      p5, p4, p3, p2, p1, p0);

  /* transpose the 16x8 matrix into 8x16 */
  /* total: 8 intermediate registers and 32 instructions */
  q7 = (v16u8)__msa_ilvod_d((v2i64)row8, (v2i64)row0);
  q6 = (v16u8)__msa_ilvod_d((v2i64)row9, (v2i64)row1);
  q5 = (v16u8)__msa_ilvod_d((v2i64)row10, (v2i64)row2);
  q4 = (v16u8)__msa_ilvod_d((v2i64)row11, (v2i64)row3);
  q3 = (v16u8)__msa_ilvod_d((v2i64)row12, (v2i64)row4);
  q2 = (v16u8)__msa_ilvod_d((v2i64)row13, (v2i64)row5);
  q1 = (v16u8)__msa_ilvod_d((v2i64)row14, (v2i64)row6);
  q0 = (v16u8)__msa_ilvod_d((v2i64)row15, (v2i64)row7);

  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
  tmp4 = (v8i16)__msa_ilvod_b((v16i8)q6, (v16i8)q7);
  tmp5 = (v8i16)__msa_ilvod_b((v16i8)q4, (v16i8)q5);

  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
  tmp6 = (v8i16)__msa_ilvod_b((v16i8)q2, (v16i8)q3);
  tmp7 = (v8i16)__msa_ilvod_b((v16i8)q0, (v16i8)q1);

  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
  q0 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q4 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp1, tmp0);
  tmp3 = (v4i32)__msa_ilvod_h((v8i16)q7, (v8i16)q5);
  q2 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q6 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
  q1 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q5 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  tmp2 = (v4i32)__msa_ilvod_h(tmp5, tmp4);
  tmp3 = (v4i32)__msa_ilvod_h(tmp7, tmp6);
  q3 = (v16u8)__msa_ilvev_w(tmp3, tmp2);
  q7 = (v16u8)__msa_ilvod_w(tmp3, tmp2);

  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
  output += (8 * out_pitch);
  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
}

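/* Vertical counterpart of the first filter stage for 8 columns: reads the
 * transposed scratch (pitch 16); on early exit the filter4 results are
 * re-transposed on the fly and stored straight to the frame via src_org. */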
int32_t aom_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
                                uint8_t *src_org, int32_t pitch_org,
                                const uint8_t *b_limit_ptr,
                                const uint8_t *limit_ptr,
                                const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);

    /* convert 16 bit output data into 8 bit */
    p2_r = (v8u16)__msa_pckev_b((v16i8)p2_filt8_r, (v16i8)p2_filt8_r);
    p1_r = (v8u16)__msa_pckev_b((v16i8)p1_filt8_r, (v16i8)p1_filt8_r);
    p0_r = (v8u16)__msa_pckev_b((v16i8)p0_filt8_r, (v16i8)p0_filt8_r);
    q0_r = (v8u16)__msa_pckev_b((v16i8)q0_filt8_r, (v16i8)q0_filt8_r);
    q1_r = (v8u16)__msa_pckev_b((v16i8)q1_filt8_r, (v16i8)q1_filt8_r);
    q2_r = (v8u16)__msa_pckev_b((v16i8)q2_filt8_r, (v16i8)q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

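/* Vertical second stage for 8 columns: the same running-sum wide filter as
 * the horizontal version, but each result row is written back to the scratch
 * buffer 8 bytes at a time so it can later be transposed back to the frame. */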
int32_t aom_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                          uint8_t *filter48) {
  v16i8 zero = { 0 };
  v16u8 filter8, flat, flat2;
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 tmp0_r, tmp1_r;
  v8i16 r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    vec2 = (v8i16)__msa_ilvr_b((v16i8)q2, (v16i8)q1);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;

    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST8x1_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST8x1_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST8x1_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST8x1_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST8x1_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST8x1_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST8x1_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST8x1_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)r_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST8x1_UB(q6, src);

    return 0;
  }
}

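/* transposed_input holds the transposed pixel block (16 rows with a pitch of
 * 16); the trailing 8 rows, aliased as filter48, are the scratch area shared
 * by the two filter stages. */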
void aom_lpf_vertical_16_msa(uint8_t *src, int32_t pitch,
                             const uint8_t *b_limit_ptr,
                             const uint8_t *limit_ptr,
                             const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);

  early_exit =
      aom_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src,
                              pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = aom_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
                                   &filter48[0]);

    if (0 == early_exit) {
      transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
    }
  }
}

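/* 16-column variant of the vertical first stage: on early exit both the
 * right and left halves of the filter4 results are transposed back and
 * stored as two groups of eight 4-byte columns. */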
int32_t aom_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
                                 uint8_t *src_org, int32_t pitch,
                                 const uint8_t *b_limit_ptr,
                                 const uint8_t *limit_ptr,
                                 const uint8_t *thresh_ptr) {
  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
  v16u8 flat, mask, hev, thresh, b_limit, limit;
  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r;
  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l;
  v16i8 zero = { 0 };
  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;

  /* load vector elements */
  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);

  thresh = (v16u8)__msa_fill_b(*thresh_ptr);
  b_limit = (v16u8)__msa_fill_b(*b_limit_ptr);
  limit = (v16u8)__msa_fill_b(*limit_ptr);

  /* mask and hev */
  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
               mask, flat);
  /* flat4 */
  AOM_FLAT4(p3, p2, p0, q0, q2, q3, flat);
  /* filter4 */
  AOM_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);

  if (__msa_test_bz_v(flat)) {
    ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec2, vec3);
    ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec4, vec5);

    src_org -= 2;
    ST4x8_UB(vec2, vec3, src_org, pitch);
    src_org += 8 * pitch;
    ST4x8_UB(vec4, vec5, src_org, pitch);

    return 1;
  } else {
    ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1, zero,
               q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r);
    AOM_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
                p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l, p0_l);
    ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l, q3_l);
    AOM_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);

    /* convert 16 bit output data into 8 bit */
    PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
                p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
                p0_filt8_r, q0_filt8_r);
    PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
                q2_filt8_r);

    /* store pixel values */
    p2_out = __msa_bmnz_v(p2, (v16u8)p2_filt8_r, flat);
    p1_out = __msa_bmnz_v(p1_out, (v16u8)p1_filt8_r, flat);
    p0_out = __msa_bmnz_v(p0_out, (v16u8)p0_filt8_r, flat);
    q0_out = __msa_bmnz_v(q0_out, (v16u8)q0_filt8_r, flat);
    q1_out = __msa_bmnz_v(q1_out, (v16u8)q1_filt8_r, flat);
    q2_out = __msa_bmnz_v(q2, (v16u8)q2_filt8_r, flat);

    ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
    filter48 += (4 * 16);
    ST_UB2(q1_out, q2_out, filter48, 16);
    filter48 += (2 * 16);
    ST_UB(flat, filter48);

    return 0;
  }
}

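/* 16-column vertical second stage: mirrors aom_hz_lpf_t16_16w with a fixed
 * scratch pitch of 16; on early exit the stashed filter8 results are
 * scattered back to the frame as 4x4 and 2x4 blocks. */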
int32_t aom_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch,
                           uint8_t *filter48) {
  v16u8 flat, flat2, filter8;
  v16i8 zero = { 0 };
  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in, p2_r_in, p1_r_in, p0_r_in;
  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in, q3_r_in, q2_r_in, q1_r_in, q0_r_in;
  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in, p3_l_in, p2_l_in, p1_l_in, p0_l_in;
  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in, q3_l_in, q2_l_in, q1_l_in, q0_l_in;
  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
  v8i16 l_out, r_out;

  flat = LD_UB(filter48 + 6 * 16);

  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);

  AOM_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);

  if (__msa_test_bz_v(flat2)) {
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;

    LD_UB4(filter48, 16, p2, p1, p0, q0);
    LD_UB2(filter48 + 4 * 16, 16, q1, q2);

    ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec3, vec4);
    ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
    ILVRL_H2_SH(vec1, vec0, vec6, vec7);
    ILVRL_B2_SH(q2, q1, vec2, vec5);

    src_org -= 3;
    ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec2, 4, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 0, (src_org + 4), pitch);
    src_org += (4 * pitch);
    ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
    ST2x4_UB(vec5, 4, (src_org + 4), pitch);

    return 1;
  } else {
    src -= 7 * 16;

    ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2, zero,
               p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in, p3_r_in,
               p2_r_in, p1_r_in, p0_r_in);
    q0_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q0);

    tmp0_r = p7_r_in << 3;
    tmp0_r -= p7_r_in;
    tmp0_r += p6_r_in;
    tmp0_r += q0_r_in;
    tmp1_r = p6_r_in + p5_r_in;
    tmp1_r += p4_r_in;
    tmp1_r += p3_r_in;
    tmp1_r += p2_r_in;
    tmp1_r += p1_r_in;
    tmp1_r += p0_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);

    ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
               p5_l_in, p4_l_in);
    ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
               p1_l_in, p0_l_in);
    q0_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q0);

    tmp0_l = p7_l_in << 3;
    tmp0_l -= p7_l_in;
    tmp0_l += p6_l_in;
    tmp0_l += q0_l_in;
    tmp1_l = p6_l_in + p5_l_in;
    tmp1_l += p4_l_in;
    tmp1_l += p3_l_in;
    tmp1_l += p2_l_in;
    tmp1_l += p1_l_in;
    tmp1_l += p0_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);

    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p6 = __msa_bmnz_v(p6, (v16u8)r_out, flat2);
    ST_UB(p6, src);
    src += 16;

    /* p5 */
    q1_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q1);
    tmp0_r = p5_r_in - p6_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q1_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q1);
    tmp0_l = p5_l_in - p6_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p5 = __msa_bmnz_v(p5, (v16u8)r_out, flat2);
    ST_UB(p5, src);
    src += 16;

    /* p4 */
    q2_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q2);
    tmp0_r = p4_r_in - p5_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q2_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q2);
    tmp0_l = p4_l_in - p5_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p4 = __msa_bmnz_v(p4, (v16u8)r_out, flat2);
    ST_UB(p4, src);
    src += 16;

    /* p3 */
    q3_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q3);
    tmp0_r = p3_r_in - p4_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q3_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q3);
    tmp0_l = p3_l_in - p4_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    p3 = __msa_bmnz_v(p3, (v16u8)r_out, flat2);
    ST_UB(p3, src);
    src += 16;

    /* p2 */
    q4_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q4);
    filter8 = LD_UB(filter48);
    tmp0_r = p2_r_in - p3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q4_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q4);
    tmp0_l = p2_l_in - p3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p1 */
    q5_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q5);
    filter8 = LD_UB(filter48 + 16);
    tmp0_r = p1_r_in - p2_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q5_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q5);
    tmp0_l = p1_l_in - p2_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* p0 */
    q6_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q6);
    filter8 = LD_UB(filter48 + 32);
    tmp0_r = p0_r_in - p1_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q6_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q6);
    tmp0_l = p0_l_in - p1_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q0 */
    q7_r_in = (v8u16)__msa_ilvr_b(zero, (v16i8)q7);
    filter8 = LD_UB(filter48 + 48);
    tmp0_r = q7_r_in - p0_r_in;
    tmp0_r += q0_r_in;
    tmp0_r -= p7_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    q7_l_in = (v8u16)__msa_ilvl_b(zero, (v16i8)q7);
    tmp0_l = q7_l_in - p0_l_in;
    tmp0_l += q0_l_in;
    tmp0_l -= p7_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q1 */
    filter8 = LD_UB(filter48 + 64);
    tmp0_r = q7_r_in - q0_r_in;
    tmp0_r += q1_r_in;
    tmp0_r -= p6_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q0_l_in;
    tmp0_l += q1_l_in;
    tmp0_l -= p6_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q2 */
    filter8 = LD_UB(filter48 + 80);
    tmp0_r = q7_r_in - q1_r_in;
    tmp0_r += q2_r_in;
    tmp0_r -= p5_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q1_l_in;
    tmp0_l += q2_l_in;
    tmp0_l -= p5_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    filter8 = __msa_bmnz_v(filter8, (v16u8)r_out, flat2);
    ST_UB(filter8, src);
    src += 16;

    /* q3 */
    tmp0_r = q7_r_in - q2_r_in;
    tmp0_r += q3_r_in;
    tmp0_r -= p4_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q2_l_in;
    tmp0_l += q3_l_in;
    tmp0_l -= p4_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q3 = __msa_bmnz_v(q3, (v16u8)r_out, flat2);
    ST_UB(q3, src);
    src += 16;

    /* q4 */
    tmp0_r = q7_r_in - q3_r_in;
    tmp0_r += q4_r_in;
    tmp0_r -= p3_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q3_l_in;
    tmp0_l += q4_l_in;
    tmp0_l -= p3_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q4 = __msa_bmnz_v(q4, (v16u8)r_out, flat2);
    ST_UB(q4, src);
    src += 16;

    /* q5 */
    tmp0_r = q7_r_in - q4_r_in;
    tmp0_r += q5_r_in;
    tmp0_r -= p2_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q4_l_in;
    tmp0_l += q5_l_in;
    tmp0_l -= p2_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q5 = __msa_bmnz_v(q5, (v16u8)r_out, flat2);
    ST_UB(q5, src);
    src += 16;

    /* q6 */
    tmp0_r = q7_r_in - q5_r_in;
    tmp0_r += q6_r_in;
    tmp0_r -= p1_r_in;
    tmp1_r += tmp0_r;
    r_out = __msa_srari_h((v8i16)tmp1_r, 4);
    tmp0_l = q7_l_in - q5_l_in;
    tmp0_l += q6_l_in;
    tmp0_l -= p1_l_in;
    tmp1_l += tmp0_l;
    l_out = __msa_srari_h((v8i16)tmp1_l, 4);
    r_out = (v8i16)__msa_pckev_b((v16i8)l_out, (v16i8)r_out);
    q6 = __msa_bmnz_v(q6, (v16u8)r_out, flat2);
    ST_UB(q6, src);

    return 0;
  }
}

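/* 16-pixel vertical edge filter: transposes a 16x16 block, runs the two
 * horizontal filter stages on the transposed data, and transposes back only
 * when the wide filter actually modified the scratch block. */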
void aom_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch,
                                  const uint8_t *b_limit_ptr,
                                  const uint8_t *limit_ptr,
                                  const uint8_t *thresh_ptr) {
  uint8_t early_exit = 0;
  DECLARE_ALIGNED(32, uint8_t, transposed_input[16 * 24]);
  uint8_t *filter48 = &transposed_input[16 * 16];

  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);

  early_exit =
      aom_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
                               pitch, b_limit_ptr, limit_ptr, thresh_ptr);

  if (0 == early_exit) {
    early_exit = aom_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
                                    &filter48[0]);

    if (0 == early_exit) {
      transpose_16x16(transposed_input, 16, (src - 8), pitch);
    }
  }
}