1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
avc_wgt_4x2_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)24 static void avc_wgt_4x2_msa(uint8_t *data, ptrdiff_t stride,
25                             int32_t log2_denom, int32_t src_weight,
26                             int32_t offset_in)
27 {
28     uint32_t tp0, tp1, offset_val;
29     v16u8 zero = { 0 };
30     v16u8 src0 = { 0 };
31     v8i16 src0_r, tmp0, wgt, denom, offset;
32 
33     offset_val = (unsigned) offset_in << log2_denom;
34 
35     wgt = __msa_fill_h(src_weight);
36     offset = __msa_fill_h(offset_val);
37     denom = __msa_fill_h(log2_denom);
38 
39     LW2(data, stride, tp0, tp1);
40     INSERT_W2_UB(tp0, tp1, src0);
41     src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
42     tmp0 = wgt * src0_r;
43     tmp0 = __msa_adds_s_h(tmp0, offset);
44     tmp0 = __msa_maxi_s_h(tmp0, 0);
45     tmp0 = __msa_srlr_h(tmp0, denom);
46     tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
47     src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
48     ST_W2(src0, 0, 1, data, stride);
49 }
50 
avc_wgt_4x4_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)51 static void avc_wgt_4x4_msa(uint8_t *data, ptrdiff_t stride,
52                             int32_t log2_denom, int32_t src_weight,
53                             int32_t offset_in)
54 {
55     uint32_t tp0, tp1, tp2, tp3, offset_val;
56     v16u8 src0 = { 0 };
57     v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
58 
59     offset_val = (unsigned) offset_in << log2_denom;
60 
61     wgt = __msa_fill_h(src_weight);
62     offset = __msa_fill_h(offset_val);
63     denom = __msa_fill_h(log2_denom);
64 
65     LW4(data, stride, tp0, tp1, tp2, tp3);
66     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
67     UNPCK_UB_SH(src0, src0_r, src1_r);
68     MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
69     ADDS_SH2_SH(tmp0, offset, tmp1, offset, tmp0, tmp1);
70     MAXI_SH2_SH(tmp0, tmp1, 0);
71     tmp0 = __msa_srlr_h(tmp0, denom);
72     tmp1 = __msa_srlr_h(tmp1, denom);
73     SAT_UH2_SH(tmp0, tmp1, 7);
74     src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
75     ST_W4(src0, 0, 1, 2, 3, data, stride);
76 }
77 
avc_wgt_4x8_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)78 static void avc_wgt_4x8_msa(uint8_t *data, ptrdiff_t stride,
79                             int32_t log2_denom, int32_t src_weight,
80                             int32_t offset_in)
81 {
82     uint32_t tp0, tp1, tp2, tp3, offset_val;
83     v16u8 src0 = { 0 }, src1 = { 0 };
84     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
85     v8i16 wgt, denom, offset;
86 
87     offset_val = (unsigned) offset_in << log2_denom;
88 
89     wgt = __msa_fill_h(src_weight);
90     offset = __msa_fill_h(offset_val);
91     denom = __msa_fill_h(log2_denom);
92 
93     LW4(data, stride, tp0, tp1, tp2, tp3);
94     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
95     LW4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
96     INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
97     UNPCK_UB_SH(src0, src0_r, src1_r);
98     UNPCK_UB_SH(src1, src2_r, src3_r);
99     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
100          tmp3);
101     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
102                 tmp1, tmp2, tmp3);
103     MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
104     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
105     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
106     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
107     ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
108 }
109 
avc_wgt_8x4_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)110 static void avc_wgt_8x4_msa(uint8_t *data, ptrdiff_t stride,
111                             int32_t log2_denom, int32_t src_weight,
112                             int32_t offset_in)
113 {
114     uint32_t offset_val;
115     uint64_t tp0, tp1, tp2, tp3;
116     v16u8 src0 = { 0 }, src1 = { 0 };
117     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
118     v8i16 wgt, denom, offset;
119 
120     offset_val = (unsigned) offset_in << log2_denom;
121 
122     wgt = __msa_fill_h(src_weight);
123     offset = __msa_fill_h(offset_val);
124     denom = __msa_fill_h(log2_denom);
125 
126     LD4(data, stride, tp0, tp1, tp2, tp3);
127     INSERT_D2_UB(tp0, tp1, src0);
128     INSERT_D2_UB(tp2, tp3, src1);
129     UNPCK_UB_SH(src0, src0_r, src1_r);
130     UNPCK_UB_SH(src1, src2_r, src3_r);
131     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
132          tmp3);
133     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
134                 tmp1, tmp2, tmp3);
135     MAXI_SH4_SH(tmp0, tmp1, tmp2, tmp3, 0);
136     SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
137     SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
138     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
139     ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
140 }
141 
avc_wgt_8x8_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)142 static void avc_wgt_8x8_msa(uint8_t *data, ptrdiff_t stride, int32_t log2_denom,
143                             int32_t src_weight, int32_t offset_in)
144 {
145     uint32_t offset_val;
146     uint64_t tp0, tp1, tp2, tp3;
147     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
148     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
149     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
150     v8i16 wgt, denom, offset;
151 
152     offset_val = (unsigned) offset_in << log2_denom;
153 
154     wgt = __msa_fill_h(src_weight);
155     offset = __msa_fill_h(offset_val);
156     denom = __msa_fill_h(log2_denom);
157 
158     LD4(data, stride, tp0, tp1, tp2, tp3);
159     INSERT_D2_UB(tp0, tp1, src0);
160     INSERT_D2_UB(tp2, tp3, src1);
161     LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
162     INSERT_D2_UB(tp0, tp1, src2);
163     INSERT_D2_UB(tp2, tp3, src3);
164     UNPCK_UB_SH(src0, src0_r, src1_r);
165     UNPCK_UB_SH(src1, src2_r, src3_r);
166     UNPCK_UB_SH(src2, src4_r, src5_r);
167     UNPCK_UB_SH(src3, src6_r, src7_r);
168     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
169          tmp3);
170     MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
171          tmp7);
172     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
173                 tmp1, tmp2, tmp3);
174     ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
175                 tmp5, tmp6, tmp7);
176     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
177     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
178     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
179     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
180                 src2, src3);
181     ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
182 }
183 
avc_wgt_8x16_msa(uint8_t * data,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t offset_in)184 static void avc_wgt_8x16_msa(uint8_t *data, ptrdiff_t stride,
185                              int32_t log2_denom, int32_t src_weight,
186                              int32_t offset_in)
187 {
188     uint32_t offset_val, cnt;
189     uint64_t tp0, tp1, tp2, tp3;
190     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
191     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
192     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
193     v8i16 wgt, denom, offset;
194 
195     offset_val = (unsigned) offset_in << log2_denom;
196 
197     wgt = __msa_fill_h(src_weight);
198     offset = __msa_fill_h(offset_val);
199     denom = __msa_fill_h(log2_denom);
200 
201     for (cnt = 2; cnt--;) {
202         LD4(data, stride, tp0, tp1, tp2, tp3);
203         INSERT_D2_UB(tp0, tp1, src0);
204         INSERT_D2_UB(tp2, tp3, src1);
205         LD4(data + 4 * stride, stride, tp0, tp1, tp2, tp3);
206         INSERT_D2_UB(tp0, tp1, src2);
207         INSERT_D2_UB(tp2, tp3, src3);
208         UNPCK_UB_SH(src0, src0_r, src1_r);
209         UNPCK_UB_SH(src1, src2_r, src3_r);
210         UNPCK_UB_SH(src2, src4_r, src5_r);
211         UNPCK_UB_SH(src3, src6_r, src7_r);
212         MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
213              tmp2, tmp3);
214         MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
215              tmp6, tmp7);
216         ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
217                     tmp0, tmp1, tmp2, tmp3);
218         ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
219                     tmp4, tmp5, tmp6, tmp7);
220         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
221         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
222         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
223         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
224                     src2, src3);
225         ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
226         data += 8 * stride;
227     }
228 }
229 
avc_biwgt_4x2_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)230 static void avc_biwgt_4x2_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
231                               int32_t log2_denom, int32_t src_weight,
232                               int32_t dst_weight, int32_t offset_in)
233 {
234     uint32_t tp0, tp1;
235     v16i8 src_wgt, dst_wgt, wgt, vec0;
236     v16u8 src0 = { 0 }, dst0 = { 0 };
237     v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
238 
239     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
240     offset_in += (128 * (src_weight + dst_weight));
241 
242     src_wgt = __msa_fill_b(src_weight);
243     dst_wgt = __msa_fill_b(dst_weight);
244     offset = __msa_fill_h(offset_in);
245     denom = __msa_fill_h(log2_denom + 1);
246 
247     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
248 
249     LW2(src, stride, tp0, tp1);
250     INSERT_W2_UB(tp0, tp1, src0);
251     LW2(dst, stride, tp0, tp1);
252     INSERT_W2_UB(tp0, tp1, dst0);
253     XORI_B2_128_UB(src0, dst0);
254     vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);
255     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
256     tmp0 >>= denom;
257     tmp0 = __msa_maxi_s_h(tmp0, 0);
258     tmp0 = __msa_min_s_h(max255, tmp0);
259     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
260     ST_W2(dst0, 0, 1, dst, stride);
261 }
262 
avc_biwgt_4x4_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)263 static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
264                               int32_t log2_denom, int32_t src_weight,
265                               int32_t dst_weight, int32_t offset_in)
266 {
267     uint32_t tp0, tp1, tp2, tp3;
268     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
269     v16u8 src0, dst0;
270     v8i16 tmp0, tmp1, denom, offset;
271 
272     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
273     offset_in += (128 * (src_weight + dst_weight));
274 
275     src_wgt = __msa_fill_b(src_weight);
276     dst_wgt = __msa_fill_b(dst_weight);
277     offset = __msa_fill_h(offset_in);
278     denom = __msa_fill_h(log2_denom + 1);
279 
280     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
281 
282     LW4(src, stride, tp0, tp1, tp2, tp3);
283     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
284     LW4(dst, stride, tp0, tp1, tp2, tp3);
285     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
286     XORI_B2_128_UB(src0, dst0);
287     ILVRL_B2_SB(dst0, src0, vec0, vec1);
288     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
289     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
290     tmp0 >>= denom;
291     tmp1 >>= denom;
292     CLIP_SH2_0_255(tmp0, tmp1);
293     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
294     ST_W4(dst0, 0, 1, 2, 3, dst, stride);
295 }
296 
avc_biwgt_4x8_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)297 static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
298                               int32_t log2_denom, int32_t src_weight,
299                               int32_t dst_weight, int32_t offset_in)
300 {
301     uint32_t tp0, tp1, tp2, tp3;
302     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
303     v16u8 src0, src1, dst0, dst1;
304     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
305 
306     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
307     offset_in += (128 * (src_weight + dst_weight));
308 
309     src_wgt = __msa_fill_b(src_weight);
310     dst_wgt = __msa_fill_b(dst_weight);
311     offset = __msa_fill_h(offset_in);
312     denom = __msa_fill_h(log2_denom + 1);
313     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
314 
315     LW4(src, stride, tp0, tp1, tp2, tp3);
316     src += 4 * stride;
317     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
318     LW4(src, stride, tp0, tp1, tp2, tp3);
319     INSERT_W4_UB(tp0, tp1, tp2, tp3, src1);
320     LW4(dst, stride, tp0, tp1, tp2, tp3);
321     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
322     LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
323     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
324     XORI_B4_128_UB(src0, src1, dst0, dst1);
325     ILVRL_B2_SB(dst0, src0, vec0, vec1);
326     ILVRL_B2_SB(dst1, src1, vec2, vec3);
327     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
328     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
329     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
330     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
331     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
332     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
333     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
334     ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
335 }
336 
avc_biwgt_8x4_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)337 static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
338                               int32_t log2_denom, int32_t src_weight,
339                               int32_t dst_weight, int32_t offset_in)
340 {
341     uint64_t tp0, tp1, tp2, tp3;
342     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
343     v16u8 src0, src1, dst0, dst1;
344     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
345 
346     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
347     offset_in += (128 * (src_weight + dst_weight));
348 
349     src_wgt = __msa_fill_b(src_weight);
350     dst_wgt = __msa_fill_b(dst_weight);
351     offset = __msa_fill_h(offset_in);
352     denom = __msa_fill_h(log2_denom + 1);
353 
354     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
355 
356     LD4(src, stride, tp0, tp1, tp2, tp3);
357     INSERT_D2_UB(tp0, tp1, src0);
358     INSERT_D2_UB(tp2, tp3, src1);
359     LD4(dst, stride, tp0, tp1, tp2, tp3);
360     INSERT_D2_UB(tp0, tp1, dst0);
361     INSERT_D2_UB(tp2, tp3, dst1);
362     XORI_B4_128_UB(src0, src1, dst0, dst1);
363     ILVRL_B2_SB(dst0, src0, vec0, vec1);
364     ILVRL_B2_SB(dst1, src1, vec2, vec3);
365     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
366     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
367     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
368     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
369     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
370     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
371     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
372     ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
373 }
374 
avc_biwgt_8x8_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)375 static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
376                               int32_t log2_denom, int32_t src_weight,
377                               int32_t dst_weight, int32_t offset_in)
378 {
379     uint64_t tp0, tp1, tp2, tp3;
380     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
381     v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
382     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
383 
384     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
385     offset_in += (128 * (src_weight + dst_weight));
386 
387     src_wgt = __msa_fill_b(src_weight);
388     dst_wgt = __msa_fill_b(dst_weight);
389     offset = __msa_fill_h(offset_in);
390     denom = __msa_fill_h(log2_denom + 1);
391     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
392 
393     LD4(src, stride, tp0, tp1, tp2, tp3);
394     INSERT_D2_UB(tp0, tp1, src0);
395     INSERT_D2_UB(tp2, tp3, src1);
396     LD4(src + 4 * stride, stride, tp0, tp1, tp2, tp3);
397     INSERT_D2_UB(tp0, tp1, src2);
398     INSERT_D2_UB(tp2, tp3, src3);
399     LD4(dst, stride, tp0, tp1, tp2, tp3);
400     INSERT_D2_UB(tp0, tp1, dst0);
401     INSERT_D2_UB(tp2, tp3, dst1);
402     LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
403     INSERT_D2_UB(tp0, tp1, dst2);
404     INSERT_D2_UB(tp2, tp3, dst3);
405     XORI_B8_128_UB(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
406     ILVRL_B2_SB(dst0, src0, vec0, vec1);
407     ILVRL_B2_SB(dst1, src1, vec2, vec3);
408     ILVRL_B2_SB(dst2, src2, vec4, vec5);
409     ILVRL_B2_SB(dst3, src3, vec6, vec7);
410     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
411     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
412     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
413     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
414     tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
415     tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
416     tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
417     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
418     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
419     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
420     CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
421     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
422     PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
423     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
424 }
425 
avc_biwgt_8x16_msa(uint8_t * src,uint8_t * dst,ptrdiff_t stride,int32_t log2_denom,int32_t src_weight,int32_t dst_weight,int32_t offset_in)426 static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
427                                int32_t log2_denom, int32_t src_weight,
428                                int32_t dst_weight, int32_t offset_in)
429 {
430     uint8_t cnt;
431     uint64_t tp0, tp1, tp2, tp3;
432     v16i8 src_wgt, dst_wgt, wgt;
433     v16u8 src0, src1, src2, src3;
434     v16u8 dst0, dst1, dst2, dst3;
435     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
436     v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
437     v8i16 denom, offset;
438 
439     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
440     offset_in += (128 * (src_weight + dst_weight));
441 
442     src_wgt = __msa_fill_b(src_weight);
443     dst_wgt = __msa_fill_b(dst_weight);
444     offset = __msa_fill_h(offset_in);
445     denom = __msa_fill_h(log2_denom + 1);
446     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
447 
448     for (cnt = 2; cnt--;) {
449         LD4(src, stride, tp0, tp1, tp2, tp3);
450         src += 4 * stride;
451         INSERT_D2_UB(tp0, tp1, src0);
452         INSERT_D2_UB(tp2, tp3, src1);
453         LD4(src, stride, tp0, tp1, tp2, tp3);
454         src += 4 * stride;
455         INSERT_D2_UB(tp0, tp1, src2);
456         INSERT_D2_UB(tp2, tp3, src3);
457         LD4(dst, stride, tp0, tp1, tp2, tp3);
458         INSERT_D2_UB(tp0, tp1, dst0);
459         INSERT_D2_UB(tp2, tp3, dst1);
460         LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
461         INSERT_D2_UB(tp0, tp1, dst2);
462         INSERT_D2_UB(tp2, tp3, dst3);
463         XORI_B4_128_UB(src0, src1, src2, src3);
464         XORI_B4_128_UB(dst0, dst1, dst2, dst3);
465         ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
466                    vec0, vec2, vec4, vec6);
467         ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3,
468                    vec1, vec3, vec5, vec7);
469 
470         temp0 = __msa_dpadd_s_h(offset, wgt, vec0);
471         temp1 = __msa_dpadd_s_h(offset, wgt, vec1);
472         temp2 = __msa_dpadd_s_h(offset, wgt, vec2);
473         temp3 = __msa_dpadd_s_h(offset, wgt, vec3);
474         temp4 = __msa_dpadd_s_h(offset, wgt, vec4);
475         temp5 = __msa_dpadd_s_h(offset, wgt, vec5);
476         temp6 = __msa_dpadd_s_h(offset, wgt, vec6);
477         temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
478 
479         SRA_4V(temp0, temp1, temp2, temp3, denom);
480         SRA_4V(temp4, temp5, temp6, temp7, denom);
481         CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
482         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
483                     dst0, dst1, dst2, dst3);
484         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
485         dst += 8 * stride;
486     }
487 }
488 
489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
490                                  q3_or_p3_org_in, p1_or_q1_org_in,          \
491                                  p2_or_q2_org_in, q1_or_p1_org_in,          \
492                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
493 {                                                                           \
494     v8i16 threshold;                                                        \
495     v8i16 const3 = __msa_ldi_h(3);                                          \
496                                                                             \
497     threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \
498     threshold += (p1_or_q1_org_in);                                         \
499                                                                             \
500     (p0_or_q0_out) = threshold << 1;                                        \
501     (p0_or_q0_out) += (p2_or_q2_org_in);                                    \
502     (p0_or_q0_out) += (q1_or_p1_org_in);                                    \
503     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \
504                                                                             \
505     (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \
506     (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \
507                                                                             \
508     (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \
509     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
510     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \
511     (p2_or_q2_out) += threshold;                                            \
512     (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \
513 }
514 
515 /* data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2); */
516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \
517                          p1_or_q1_org_in, p0_or_q0_out)      \
518 {                                                            \
519     (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \
520     (p0_or_q0_out) += (p1_or_q1_org_in);                     \
521     (p0_or_q0_out) += (p1_or_q1_org_in);                     \
522     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \
523 }
524 
525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \
526                          p1_or_q1_org_in, p2_or_q2_org_in,    \
527                          negate_tc_in, tc_in, p1_or_q1_out)   \
528 {                                                             \
529     v8i16 clip3, temp;                                        \
530                                                               \
531     clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \
532                                    (v8u16) q0_or_p0_org_in);  \
533     temp = p1_or_q1_org_in << 1;                              \
534     clip3 = clip3 - temp;                                     \
535     clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \
536     CLIP_SH(clip3, negate_tc_in, tc_in);                      \
537     p1_or_q1_out = p1_or_q1_org_in + clip3;                   \
538 }
539 
540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \
541                      p1_or_q1_org_in, q1_or_p1_org_in,          \
542                      negate_threshold_in, threshold_in,         \
543                      p0_or_q0_out, q0_or_p0_out)                \
544 {                                                               \
545     v8i16 q0_sub_p0, p1_sub_q1, delta;                          \
546                                                                 \
547     q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \
548     p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \
549     q0_sub_p0 <<= 2;                                            \
550     p1_sub_q1 += 4;                                             \
551     delta = q0_sub_p0 + p1_sub_q1;                              \
552     delta >>= 3;                                                \
553                                                                 \
554     CLIP_SH(delta, negate_threshold_in, threshold_in);          \
555                                                                 \
556     p0_or_q0_out = p0_or_q0_org_in + delta;                     \
557     q0_or_p0_out = q0_or_p0_org_in - delta;                     \
558                                                                 \
559     CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \
560 }
561 
562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \
563 {                                                                        \
564     uint32_t load0, load1, load2, load3;                                 \
565     v16u8 src0 = { 0 };                                                  \
566     v16u8 src1 = { 0 };                                                  \
567     v16u8 src2 = { 0 };                                                  \
568     v16u8 src3 = { 0 };                                                  \
569     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \
570     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \
571     v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \
572     v8i16 res0_r, res1_r;                                                \
573     v16i8 zeros = { 0 };                                                 \
574     v16u8 res0, res1;                                                    \
575                                                                          \
576     LW4((src - 2), stride, load0, load1, load2, load3);                  \
577     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \
578     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \
579     src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \
580     src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \
581                                                                          \
582     TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \
583                                                                          \
584     p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \
585     p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \
586     q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \
587                                                                          \
588     tc = __msa_fill_h(tc_val);                                           \
589                                                                          \
590     is_less_than_alpha = (p0_asub_q0 < alpha);                           \
591     is_less_than_beta = (p1_asub_p0 < beta);                             \
592     is_less_than = is_less_than_alpha & is_less_than_beta;               \
593     is_less_than_beta = (q1_asub_q0 < beta);                             \
594     is_less_than = is_less_than_beta & is_less_than;                     \
595                                                                          \
596     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \
597     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \
598                                                                          \
599     q0_sub_p0 <<= 2;                                                     \
600     delta = q0_sub_p0 + p1_sub_q1;                                       \
601     delta = __msa_srari_h(delta, 3);                                     \
602                                                                          \
603     CLIP_SH(delta, -tc, tc);                                             \
604                                                                          \
605     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \
606                                                                          \
607     res0_r += delta;                                                     \
608     res1_r -= delta;                                                     \
609                                                                          \
610     CLIP_SH2_0_255(res0_r, res1_r);                                      \
611     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \
612                                                                          \
613     res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \
614     res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \
615                                                                          \
616     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \
617 }
618 
619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \
620 {                                                            \
621     v16i8 zero_m = { 0 };                                    \
622                                                              \
623     out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \
624     out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \
625     SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \
626 }
627 
628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \
629 {                                                                          \
630     uint32_t load0, load1;                                                 \
631     v16u8 src0 = { 0 };                                                    \
632     v16u8 src1 = { 0 };                                                    \
633     v16u8 src2 = { 0 };                                                    \
634     v16u8 src3 = { 0 };                                                    \
635     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \
636     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \
637     v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \
638     v16i8 zeros = { 0 };                                                   \
639     v16u8 res0, res1;                                                      \
640                                                                            \
641     load0 = LW(src - 2);                                                   \
642     load1 = LW(src - 2 + stride);                                          \
643                                                                            \
644     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \
645     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \
646                                                                            \
647     TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \
648                                                                            \
649     p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \
650     p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \
651     q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \
652                                                                            \
653     tc = __msa_fill_h(tc_val);                                             \
654                                                                            \
655     is_less_than_alpha = (p0_asub_q0 < alpha);                             \
656     is_less_than_beta = (p1_asub_p0 < beta);                               \
657     is_less_than = is_less_than_alpha & is_less_than_beta;                 \
658     is_less_than_beta = (q1_asub_q0 < beta);                               \
659     is_less_than = is_less_than_beta & is_less_than;                       \
660                                                                            \
661     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \
662     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \
663                                                                            \
664     q0_sub_p0 <<= 2;                                                       \
665     delta = q0_sub_p0 + p1_sub_q1;                                         \
666     delta = __msa_srari_h(delta, 3);                                       \
667     CLIP_SH(delta, -tc, tc);                                               \
668                                                                            \
669     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \
670                                                                            \
671     res0_r += delta;                                                       \
672     res1_r -= delta;                                                       \
673                                                                            \
674     CLIP_SH2_0_255(res0_r, res1_r);                                        \
675     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \
676                                                                            \
677     res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \
678     res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \
679                                                                            \
680     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \
681 }
682 
avc_loopfilter_luma_intra_edge_hor_msa(uint8_t * data,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)683 static void avc_loopfilter_luma_intra_edge_hor_msa(uint8_t *data,
684                                                    uint8_t alpha_in,
685                                                    uint8_t beta_in,
686                                                    ptrdiff_t img_width)
687 {
688     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
689     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
690     v16u8 p1_org, p0_org, q0_org, q1_org;
691 
692     LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
693 
694     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
695     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
696     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
697 
698     is_less_than_alpha = (p0_asub_q0 < alpha_in);
699     is_less_than_beta = (p1_asub_p0 < beta_in);
700     is_less_than = is_less_than_beta & is_less_than_alpha;
701     is_less_than_beta = (q1_asub_q0 < beta_in);
702     is_less_than = is_less_than_beta & is_less_than;
703 
704     if (!__msa_test_bz_v(is_less_than)) {
705         v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
706         v8i16 p0_r = { 0 };
707         v8i16 q0_r = { 0 };
708         v8i16 p0_l = { 0 };
709         v8i16 q0_l = { 0 };
710         v16i8 zero = { 0 };
711         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
712         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
713         v16u8 q2_org = LD_UB(data + (2 * img_width));
714         v16u8 p2_org = LD_UB(data - (3 * img_width));
715         v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
716 
717         UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
718         UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
719         UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
720 
721         tmp_flag = (p0_asub_q0 < tmp_flag);
722 
723         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
724         is_less_than_beta = (p2_asub_p0 < beta_in);
725         is_less_than_beta = is_less_than_beta & tmp_flag;
726         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
727         is_less_than_beta = is_less_than_beta & is_less_than;
728         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
729 
730         q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
731         q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
732 
733         /* combine and store */
734         if (!__msa_test_bz_v(is_less_than_beta)) {
735             v8i16 p3_org_l, p3_org_r;
736             v16u8 p3_org = LD_UB(data - (img_width << 2));
737             v16u8 p2, p1;
738             v8i16 p2_r = { 0 };
739             v8i16 p2_l = { 0 };
740             v8i16 p1_r = { 0 };
741             v8i16 p1_l = { 0 };
742 
743             ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
744             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
745                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
746 
747             ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
748             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
749                                      p2_l, q1_org_l, p0_l, p1_l, p2_l);
750 
751             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
752 
753             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
754             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
755             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
756 
757             ST_UB(p1_org, data - (2 * img_width));
758             ST_UB(p2_org, data - (3 * img_width));
759         }
760 
761         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
762         AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
763 
764         /* combine */
765         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
766         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
767 
768         ST_UB(p0_org, data - img_width);
769 
770         /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
771         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
772         is_less_than_beta = (q2_asub_q0 < beta_in);
773         is_less_than_beta = is_less_than_beta & tmp_flag;
774         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
775         is_less_than_beta = is_less_than_beta & is_less_than;
776         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
777 
778         /* combine and store */
779         if (!__msa_test_bz_v(is_less_than_beta)) {
780             v8i16 q3_org_r, q3_org_l;
781             v16u8 q3_org = LD_UB(data + (3 * img_width));
782             v16u8 q1, q2;
783             v8i16 q2_r = { 0 };
784             v8i16 q2_l = { 0 };
785             v8i16 q1_r = { 0 };
786             v8i16 q1_l = { 0 };
787 
788             ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
789             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
790                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
791 
792             ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
793             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
794                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
795 
796             PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
797             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
798             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
799             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
800 
801             ST_UB(q1_org, data + img_width);
802             ST_UB(q2_org, data + 2 * img_width);
803         }
804 
805         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
806         AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
807 
808         /* combine */
809         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
810         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
811 
812         ST_UB(q0_org, data);
813     }
814 }
815 
avc_loopfilter_luma_intra_edge_ver_msa(uint8_t * data,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)816 static void avc_loopfilter_luma_intra_edge_ver_msa(uint8_t *data,
817                                                    uint8_t alpha_in,
818                                                    uint8_t beta_in,
819                                                    ptrdiff_t img_width)
820 {
821     uint8_t *src = data - 4;
822     v16u8 alpha, beta, p0_asub_q0;
823     v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
824     v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
825     v16u8 p1_asub_p0, q1_asub_q0;
826 
827 
828     {
829         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
830         v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
831 
832         LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
833         LD_UB8(src + (8 * img_width), img_width,
834                row8, row9, row10, row11, row12, row13, row14, row15);
835 
836         TRANSPOSE16x8_UB_UB(row0, row1, row2, row3,
837                             row4, row5, row6, row7,
838                             row8, row9, row10, row11,
839                             row12, row13, row14, row15,
840                             p3_org, p2_org, p1_org, p0_org,
841                             q0_org, q1_org, q2_org, q3_org);
842     }
843 
844     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
845     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
846     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
847 
848     alpha = (v16u8) __msa_fill_b(alpha_in);
849     beta = (v16u8) __msa_fill_b(beta_in);
850 
851     is_less_than_alpha = (p0_asub_q0 < alpha);
852     is_less_than_beta = (p1_asub_p0 < beta);
853     is_less_than = is_less_than_beta & is_less_than_alpha;
854     is_less_than_beta = (q1_asub_q0 < beta);
855     is_less_than = is_less_than_beta & is_less_than;
856 
857     if (!__msa_test_bz_v(is_less_than)) {
858         v8i16 p0_r = { 0 };
859         v8i16 q0_r = { 0 };
860         v8i16 p0_l = { 0 };
861         v8i16 q0_l = { 0 };
862         v16i8 zero = { 0 };
863         v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
864         v16u8 negate_is_less_than_beta;
865         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
866         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
867 
868         UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
869         UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
870         UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
871         UNPCK_UB_SH(q1_org, q1_org_r, q1_org_l);
872 
873         tmp_flag = alpha >> 2;
874         tmp_flag = tmp_flag + 2;
875         tmp_flag = (p0_asub_q0 < tmp_flag);
876 
877         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
878         is_less_than_beta = (p2_asub_p0 < beta);
879         is_less_than_beta = tmp_flag & is_less_than_beta;
880         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
881         is_less_than_beta = is_less_than_beta & is_less_than;
882         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
883 
884         if (!__msa_test_bz_v(is_less_than_beta)) {
885             v16u8 p2, p1;
886             v8i16 p3_org_r, p3_org_l;
887             v8i16 p2_l = { 0 };
888             v8i16 p2_r = { 0 };
889             v8i16 p1_l = { 0 };
890             v8i16 p1_r = { 0 };
891 
892             ILVR_B2_SH(zero, p3_org, zero, p2_org, p3_org_r, p2_r);
893             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_r, p0_org_r, q0_org_r, p1_org_r,
894                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
895 
896             ILVL_B2_SH(zero, p3_org, zero, p2_org, p3_org_l, p2_l);
897             AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_l, p0_org_l, q0_org_l, p1_org_l,
898                                          p2_l, q1_org_l, p0_l, p1_l, p2_l);
899 
900             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
901             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
902             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
903             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
904         }
905 
906         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_r);
907         AVC_LPF_P0_OR_Q0(p0_org_l, q1_org_l, p1_org_l, p0_l);
908 
909         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
910         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
911 
912         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
913         is_less_than_beta = (q2_asub_q0 < beta);
914 
915         is_less_than_beta = is_less_than_beta & tmp_flag;
916         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
917 
918         is_less_than_beta = is_less_than_beta & is_less_than;
919         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
920 
921         if (!__msa_test_bz_v(is_less_than_beta)) {
922             v16u8 q1, q2;
923             v8i16 q3_org_r, q3_org_l;
924             v8i16 q1_l = { 0 };
925             v8i16 q1_r = { 0 };
926             v8i16 q2_l = { 0 };
927             v8i16 q2_r = { 0 };
928 
929             ILVR_B2_SH(zero, q3_org, zero, q2_org, q3_org_r, q2_r);
930             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_r, q0_org_r, p0_org_r, q1_org_r,
931                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
932 
933             ILVL_B2_SH(zero, q3_org, zero, q2_org, q3_org_l, q2_l);
934             AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_l, q0_org_l, p0_org_l, q1_org_l,
935                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
936 
937             PCKEV_B3_UB(q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2);
938             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);
939             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
940             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
941         }
942 
943         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_r);
944         AVC_LPF_P0_OR_Q0(q0_org_l, p1_org_l, q1_org_l, q0_l);
945 
946         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
947         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
948 
949     {
950         v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
951 
952         ILVRL_B2_SH(p1_org, p2_org, tp0, tp2);
953         ILVRL_B2_SH(q0_org, p0_org, tp1, tp3);
954         ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
955 
956         ILVRL_H2_SH(tp1, tp0, tmp3, tmp4);
957         ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
958 
959         src = data - 3;
960         ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
961         ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
962         src += 4 * img_width;
963         ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
964         ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
965         src += 4 * img_width;
966 
967         ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
968         ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
969         src += 4 * img_width;
970         ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
971         ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
972     }
973     }
974 }
975 
avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t * src,ptrdiff_t stride,int32_t alpha_in,int32_t beta_in)976 static void avc_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
977                                                    ptrdiff_t stride,
978                                                    int32_t alpha_in,
979                                                    int32_t beta_in)
980 {
981     uint64_t load0, load1;
982     uint32_t out0, out2;
983     uint16_t out1, out3;
984     v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
985     v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
986     v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
987     v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
988     v8i16 tmp0, tmp1, tmp2, tmp3;
989     v16u8 alpha, beta;
990     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
991     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
992     v16u8 is_less_than_beta1, is_less_than_beta2;
993     v16i8 src0 = { 0 };
994     v16i8 src1 = { 0 };
995     v16i8 src2 = { 0 };
996     v16i8 src3 = { 0 };
997     v16i8 src4 = { 0 };
998     v16i8 src5 = { 0 };
999     v16i8 src6 = { 0 };
1000     v16i8 src7 = { 0 };
1001     v16i8 zeros = { 0 };
1002 
1003     load0 = LD(src - 4);
1004     load1 = LD(src + stride - 4);
1005     src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);
1006     src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);
1007 
1008     load0 = LD(src + (2 * stride) - 4);
1009     load1 = LD(src + (3 * stride) - 4);
1010     src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
1011     src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
1012 
1013     load0 = LD(src + (4 * stride) - 4);
1014     load1 = LD(src + (5 * stride) - 4);
1015     src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
1016     src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
1017 
1018     load0 = LD(src + (6 * stride) - 4);
1019     load1 = LD(src + (7 * stride) - 4);
1020     src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
1021     src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
1022 
1023     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
1024                src0, src1, src2, src3);
1025 
1026     ILVR_H2_SH(src1, src0, src3, src2, tmp0, tmp2);
1027     ILVL_H2_SH(src1, src0, src3, src2, tmp1, tmp3);
1028 
1029     ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
1030     ILVL_W2_SB(tmp2, tmp0, tmp3, tmp1, src1, src5);
1031     SLDI_B4_SB(zeros, src6, zeros, src1, zeros, src3, zeros, src5,
1032                8, src0, src2, src4, src7);
1033 
1034     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1035     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1036     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1037 
1038     alpha = (v16u8) __msa_fill_b(alpha_in);
1039     beta = (v16u8) __msa_fill_b(beta_in);
1040 
1041     is_less_than_alpha = (p0_asub_q0 < alpha);
1042     is_less_than_beta = (p1_asub_p0 < beta);
1043     is_less_than = is_less_than_alpha & is_less_than_beta;
1044     is_less_than_beta = (q1_asub_q0 < beta);
1045     is_less_than = is_less_than & is_less_than_beta;
1046 
1047     alpha >>= 2;
1048     alpha += 2;
1049 
1050     is_less_than_alpha = (p0_asub_q0 < alpha);
1051 
1052     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1053     is_less_than_beta1 = (p2_asub_p0 < beta);
1054     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1055     is_less_than_beta2 = (q2_asub_q0 < beta);
1056 
1057     ILVR_B4_UH(zeros, src0, zeros, src1, zeros, src2, zeros, src3,
1058                src0_r, src1_r, src2_r, src3_r);
1059     ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
1060                src4_r, src5_r, src6_r, src7_r);
1061 
1062     dst2_x_r = src1_r + src2_r + src3_r;
1063     dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
1064     dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
1065     dst1_r = src0_r + src1_r + src2_r + src3_r;
1066     dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
1067 
1068     dst0_r = (2 * src6_r) + (3 * src0_r);
1069     dst0_r += src1_r + src2_r + src3_r;
1070     dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
1071     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1072     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1073 
1074     PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
1075     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
1076 
1077     dst3_x_r = src2_r + src3_r + src4_r;
1078     dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
1079     dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
1080     dst4_r = src2_r + src3_r + src4_r + src5_r;
1081     dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
1082 
1083     dst5_r = (2 * src7_r) + (3 * src5_r);
1084     dst5_r += src4_r + src3_r + src2_r;
1085     dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
1086     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1087     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1088 
1089     PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
1090     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
1091 
1092     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
1093     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
1094     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
1095     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
1096 
1097     PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
1098 
1099     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
1100     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
1101     dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
1102     dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
1103 
1104     is_less_than = is_less_than_alpha & is_less_than;
1105     dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
1106     is_less_than_beta1 = is_less_than_beta1 & is_less_than;
1107     dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
1108 
1109     dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
1110     dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
1111     dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
1112     is_less_than_beta2 = is_less_than_beta2 & is_less_than;
1113     dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
1114     dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
1115     dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
1116 
1117     ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
1118     dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
1119     ILVRL_H2_SH(dst1, dst0, tmp0, tmp1);
1120     ILVRL_H2_SH(zeros, dst2_x, tmp2, tmp3);
1121 
1122     ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
1123     SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
1124     dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
1125     dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
1126     SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
1127 
1128     out0 = __msa_copy_u_w((v4i32) dst0, 0);
1129     out1 = __msa_copy_u_h((v8i16) dst0, 2);
1130     out2 = __msa_copy_u_w((v4i32) dst1, 0);
1131     out3 = __msa_copy_u_h((v8i16) dst1, 2);
1132 
1133     SW(out0, (src - 3));
1134     SH(out1, (src + 1));
1135     src += stride;
1136     SW(out2, (src - 3));
1137     SH(out3, (src + 1));
1138     src += stride;
1139 
1140     out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
1141     out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
1142     out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
1143     out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
1144 
1145     SW(out0, (src - 3));
1146     SH(out1, (src + 1));
1147     src += stride;
1148     SW(out2, (src - 3));
1149     SH(out3, (src + 1));
1150     src += stride;
1151 
1152     out0 = __msa_copy_u_w((v4i32) dst4, 0);
1153     out1 = __msa_copy_u_h((v8i16) dst4, 2);
1154     out2 = __msa_copy_u_w((v4i32) dst5, 0);
1155     out3 = __msa_copy_u_h((v8i16) dst5, 2);
1156 
1157     SW(out0, (src - 3));
1158     SH(out1, (src + 1));
1159     src += stride;
1160     SW(out2, (src - 3));
1161     SH(out3, (src + 1));
1162     src += stride;
1163 
1164     out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
1165     out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
1166     out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
1167     out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
1168 
1169     SW(out0, (src - 3));
1170     SH(out1, (src + 1));
1171     src += stride;
1172     SW(out2, (src - 3));
1173     SH(out3, (src + 1));
1174 }
1175 
avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t * data_cb_or_cr,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)1176 static void avc_loopfilter_cb_or_cr_intra_edge_hor_msa(uint8_t *data_cb_or_cr,
1177                                                        uint8_t alpha_in,
1178                                                        uint8_t beta_in,
1179                                                        ptrdiff_t img_width)
1180 {
1181     v16u8 alpha, beta;
1182     v16u8 is_less_than;
1183     v8i16 p0_or_q0, q0_or_p0;
1184     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1185     v16i8 zero = { 0 };
1186     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1187     v16u8 is_less_than_alpha, is_less_than_beta;
1188     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1189 
1190     alpha = (v16u8) __msa_fill_b(alpha_in);
1191     beta = (v16u8) __msa_fill_b(beta_in);
1192 
1193     LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
1194            p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
1195 
1196     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1197     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1198     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1199 
1200     is_less_than_alpha = (p0_asub_q0 < alpha);
1201     is_less_than_beta = (p1_asub_p0 < beta);
1202     is_less_than = is_less_than_beta & is_less_than_alpha;
1203     is_less_than_beta = (q1_asub_q0 < beta);
1204     is_less_than = is_less_than_beta & is_less_than;
1205 
1206     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1207 
1208     if (!__msa_test_bz_v(is_less_than)) {
1209         ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1210                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1211         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1212         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1213         PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1214 
1215         p0_or_q0_org =
1216             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1217         q0_or_p0_org =
1218             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1219 
1220         ST_UB(q0_or_p0_org, data_cb_or_cr);
1221         ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
1222     }
1223 }
1224 
avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t * data_cb_or_cr,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)1225 static void avc_loopfilter_cb_or_cr_intra_edge_ver_msa(uint8_t *data_cb_or_cr,
1226                                                        uint8_t alpha_in,
1227                                                        uint8_t beta_in,
1228                                                        ptrdiff_t img_width)
1229 {
1230     v8i16 tmp1;
1231     v16u8 alpha, beta, is_less_than;
1232     v8i16 p0_or_q0, q0_or_p0;
1233     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
1234     v16i8 zero = { 0 };
1235     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1236     v16u8 is_less_than_alpha, is_less_than_beta;
1237     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1238 
1239     {
1240         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1241 
1242         LD_UB8((data_cb_or_cr - 2), img_width,
1243                row0, row1, row2, row3, row4, row5, row6, row7);
1244 
1245         TRANSPOSE8x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1246                            p1_or_q1_org, p0_or_q0_org,
1247                            q0_or_p0_org, q1_or_p1_org);
1248     }
1249 
1250     alpha = (v16u8) __msa_fill_b(alpha_in);
1251     beta = (v16u8) __msa_fill_b(beta_in);
1252 
1253     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
1254     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
1255     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
1256 
1257     is_less_than_alpha = (p0_asub_q0 < alpha);
1258     is_less_than_beta = (p1_asub_p0 < beta);
1259     is_less_than = is_less_than_beta & is_less_than_alpha;
1260     is_less_than_beta = (q1_asub_q0 < beta);
1261     is_less_than = is_less_than_beta & is_less_than;
1262     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
1263 
1264     if (!__msa_test_bz_v(is_less_than)) {
1265         ILVR_B4_SH(zero, p1_or_q1_org, zero, p0_or_q0_org, zero, q0_or_p0_org,
1266                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
1267 
1268         AVC_LPF_P0_OR_Q0(p0_org_r, q1_org_r, p1_org_r, p0_or_q0);
1269         AVC_LPF_P0_OR_Q0(q0_org_r, p1_org_r, q1_org_r, q0_or_p0);
1270 
1271         /* convert 16 bit output into 8 bit output */
1272         PCKEV_B2_SH(zero, p0_or_q0, zero, q0_or_p0, p0_or_q0, q0_or_p0);
1273 
1274         p0_or_q0_org =
1275             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
1276         q0_or_p0_org =
1277             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
1278         tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
1279 
1280         data_cb_or_cr -= 1;
1281         ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
1282         data_cb_or_cr += 4 * img_width;
1283         ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
1284     }
1285 }
1286 
avc_loopfilter_luma_inter_edge_ver_msa(uint8_t * data,uint8_t bs0,uint8_t bs1,uint8_t bs2,uint8_t bs3,uint8_t tc0,uint8_t tc1,uint8_t tc2,uint8_t tc3,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)1287 static void avc_loopfilter_luma_inter_edge_ver_msa(uint8_t *data,
1288                                                    uint8_t bs0, uint8_t bs1,
1289                                                    uint8_t bs2, uint8_t bs3,
1290                                                    uint8_t tc0, uint8_t tc1,
1291                                                    uint8_t tc2, uint8_t tc3,
1292                                                    uint8_t alpha_in,
1293                                                    uint8_t beta_in,
1294                                                    ptrdiff_t img_width)
1295 {
1296     v16u8 tmp_vec, bs = { 0 };
1297 
1298     tmp_vec = (v16u8) __msa_fill_b(bs0);
1299     bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1300     tmp_vec = (v16u8) __msa_fill_b(bs1);
1301     bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1302     tmp_vec = (v16u8) __msa_fill_b(bs2);
1303     bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1304     tmp_vec = (v16u8) __msa_fill_b(bs3);
1305     bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1306 
1307     if (!__msa_test_bz_v(bs)) {
1308         uint8_t *src = data - 4;
1309         v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
1310         v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
1311         v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
1312         v16u8 is_bs_greater_than0;
1313         v16u8 tc = { 0 };
1314         v16i8 zero = { 0 };
1315 
1316         tmp_vec = (v16u8) __msa_fill_b(tc0);
1317         tc = (v16u8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1318         tmp_vec = (v16u8) __msa_fill_b(tc1);
1319         tc = (v16u8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1320         tmp_vec = (v16u8) __msa_fill_b(tc2);
1321         tc = (v16u8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1322         tmp_vec = (v16u8) __msa_fill_b(tc3);
1323         tc = (v16u8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1324 
1325         is_bs_greater_than0 = (zero < bs);
1326 
1327         {
1328             v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1329             v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1330 
1331             LD_UB8(src, img_width,
1332                    row0, row1, row2, row3, row4, row5, row6, row7);
1333             src += (8 * img_width);
1334             LD_UB8(src, img_width,
1335                    row8, row9, row10, row11, row12, row13, row14, row15);
1336 
1337             TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1338                                 row8, row9, row10, row11,
1339                                 row12, row13, row14, row15,
1340                                 p3_org, p2_org, p1_org, p0_org,
1341                                 q0_org, q1_org, q2_org, q3_org);
1342         }
1343 
1344         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1345         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1346         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1347 
1348         alpha = (v16u8) __msa_fill_b(alpha_in);
1349         beta = (v16u8) __msa_fill_b(beta_in);
1350 
1351         is_less_than_alpha = (p0_asub_q0 < alpha);
1352         is_less_than_beta = (p1_asub_p0 < beta);
1353         is_less_than = is_less_than_beta & is_less_than_alpha;
1354         is_less_than_beta = (q1_asub_q0 < beta);
1355         is_less_than = is_less_than_beta & is_less_than;
1356         is_less_than = is_less_than & is_bs_greater_than0;
1357 
1358         if (!__msa_test_bz_v(is_less_than)) {
1359             v16i8 negate_tc, sign_negate_tc;
1360             v16u8 p0, q0, p2_asub_p0, q2_asub_q0;
1361             v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
1362             v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1363             v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1364             v8i16 p0_r, q0_r, p0_l, q0_l;
1365 
1366             negate_tc = zero - (v16i8) tc;
1367             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1368 
1369             ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1370 
1371             UNPCK_UB_SH(tc, tc_r, tc_l);
1372             UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1373             UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1374             UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1375 
1376             p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1377             is_less_than_beta = (p2_asub_p0 < beta);
1378             is_less_than_beta = is_less_than_beta & is_less_than;
1379 
1380             if (!__msa_test_bz_v(is_less_than_beta)) {
1381                 v16u8 p1;
1382                 v8i16 p1_r = { 0 };
1383                 v8i16 p1_l = { 0 };
1384                 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1385                 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1386 
1387                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1388                                  negate_tc_r, tc_r, p1_r);
1389                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1390                                  i16_negatetc_l, tc_l, p1_l);
1391 
1392                 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1393                 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1394 
1395                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1396                 tc = tc + is_less_than_beta;
1397             }
1398 
1399             q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1400             is_less_than_beta = (q2_asub_q0 < beta);
1401             is_less_than_beta = is_less_than_beta & is_less_than;
1402 
1403             q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1404             q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1405 
1406             if (!__msa_test_bz_v(is_less_than_beta)) {
1407                 v16u8 q1;
1408                 v8i16 q1_r = { 0 };
1409                 v8i16 q1_l = { 0 };
1410                 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1411                 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1412 
1413                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1414                                  negate_tc_r, tc_r, q1_r);
1415                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1416                                  i16_negatetc_l, tc_l, q1_l);
1417 
1418                 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1419                 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1420 
1421                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1422                 tc = tc + is_less_than_beta;
1423             }
1424 
1425             {
1426                 v8i16 threshold_r, negate_thresh_r;
1427                 v8i16 threshold_l, negate_thresh_l;
1428                 v16i8 negate_thresh, sign_negate_thresh;
1429 
1430                 negate_thresh = zero - (v16i8) tc;
1431                 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1432 
1433                 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1434                            threshold_r, negate_thresh_r);
1435 
1436                 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1437                              negate_thresh_r, threshold_r, p0_r, q0_r);
1438 
1439                 threshold_l = (v8i16) __msa_ilvl_b(zero, (v16i8) tc);
1440                 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1441                                                        negate_thresh);
1442 
1443                 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1444                              negate_thresh_l, threshold_l, p0_l, q0_l);
1445             }
1446 
1447             PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1448 
1449             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1450             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1451 
1452         {
1453             v16i8 tp0, tp1, tp2, tp3;
1454             v8i16 tmp2, tmp5;
1455             v4i32 tmp3, tmp4, tmp6, tmp7;
1456             uint32_t out0, out2;
1457             uint16_t out1, out3;
1458 
1459             src = data - 3;
1460 
1461             ILVRL_B2_SB(p1_org, p2_org, tp0, tp2);
1462             ILVRL_B2_SB(q0_org, p0_org, tp1, tp3);
1463             ILVRL_B2_SH(q2_org, q1_org, tmp2, tmp5);
1464 
1465             ILVRL_H2_SW(tp1, tp0, tmp3, tmp4);
1466             ILVRL_H2_SW(tp3, tp2, tmp6, tmp7);
1467 
1468             out0 = __msa_copy_u_w(tmp3, 0);
1469             out1 = __msa_copy_u_h(tmp2, 0);
1470             out2 = __msa_copy_u_w(tmp3, 1);
1471             out3 = __msa_copy_u_h(tmp2, 1);
1472 
1473             SW(out0, src);
1474             SH(out1, (src + 4));
1475             src += img_width;
1476             SW(out2, src);
1477             SH(out3, (src + 4));
1478 
1479             out0 = __msa_copy_u_w(tmp3, 2);
1480             out1 = __msa_copy_u_h(tmp2, 2);
1481             out2 = __msa_copy_u_w(tmp3, 3);
1482             out3 = __msa_copy_u_h(tmp2, 3);
1483 
1484             src += img_width;
1485             SW(out0, src);
1486             SH(out1, (src + 4));
1487             src += img_width;
1488             SW(out2, src);
1489             SH(out3, (src + 4));
1490 
1491             out0 = __msa_copy_u_w(tmp4, 0);
1492             out1 = __msa_copy_u_h(tmp2, 4);
1493             out2 = __msa_copy_u_w(tmp4, 1);
1494             out3 = __msa_copy_u_h(tmp2, 5);
1495 
1496             src += img_width;
1497             SW(out0, src);
1498             SH(out1, (src + 4));
1499             src += img_width;
1500             SW(out2, src);
1501             SH(out3, (src + 4));
1502 
1503             out0 = __msa_copy_u_w(tmp4, 2);
1504             out1 = __msa_copy_u_h(tmp2, 6);
1505             out2 = __msa_copy_u_w(tmp4, 3);
1506             out3 = __msa_copy_u_h(tmp2, 7);
1507 
1508             src += img_width;
1509             SW(out0, src);
1510             SH(out1, (src + 4));
1511             src += img_width;
1512             SW(out2, src);
1513             SH(out3, (src + 4));
1514 
1515             out0 = __msa_copy_u_w(tmp6, 0);
1516             out1 = __msa_copy_u_h(tmp5, 0);
1517             out2 = __msa_copy_u_w(tmp6, 1);
1518             out3 = __msa_copy_u_h(tmp5, 1);
1519 
1520             src += img_width;
1521             SW(out0, src);
1522             SH(out1, (src + 4));
1523             src += img_width;
1524             SW(out2, src);
1525             SH(out3, (src + 4));
1526 
1527             out0 = __msa_copy_u_w(tmp6, 2);
1528             out1 = __msa_copy_u_h(tmp5, 2);
1529             out2 = __msa_copy_u_w(tmp6, 3);
1530             out3 = __msa_copy_u_h(tmp5, 3);
1531 
1532             src += img_width;
1533             SW(out0, src);
1534             SH(out1, (src + 4));
1535             src += img_width;
1536             SW(out2, src);
1537             SH(out3, (src + 4));
1538 
1539             out0 = __msa_copy_u_w(tmp7, 0);
1540             out1 = __msa_copy_u_h(tmp5, 4);
1541             out2 = __msa_copy_u_w(tmp7, 1);
1542             out3 = __msa_copy_u_h(tmp5, 5);
1543 
1544             src += img_width;
1545             SW(out0, src);
1546             SH(out1, (src + 4));
1547             src += img_width;
1548             SW(out2, src);
1549             SH(out3, (src + 4));
1550 
1551             out0 = __msa_copy_u_w(tmp7, 2);
1552             out1 = __msa_copy_u_h(tmp5, 6);
1553             out2 = __msa_copy_u_w(tmp7, 3);
1554             out3 = __msa_copy_u_h(tmp5, 7);
1555 
1556             src += img_width;
1557             SW(out0, src);
1558             SH(out1, (src + 4));
1559             src += img_width;
1560             SW(out2, src);
1561             SH(out3, (src + 4));
1562         }
1563         }
1564     }
1565 }
1566 
avc_loopfilter_luma_inter_edge_hor_msa(uint8_t * data,uint8_t bs0,uint8_t bs1,uint8_t bs2,uint8_t bs3,uint8_t tc0,uint8_t tc1,uint8_t tc2,uint8_t tc3,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t image_width)1567 static void avc_loopfilter_luma_inter_edge_hor_msa(uint8_t *data,
1568                                                    uint8_t bs0, uint8_t bs1,
1569                                                    uint8_t bs2, uint8_t bs3,
1570                                                    uint8_t tc0, uint8_t tc1,
1571                                                    uint8_t tc2, uint8_t tc3,
1572                                                    uint8_t alpha_in,
1573                                                    uint8_t beta_in,
1574                                                    ptrdiff_t image_width)
1575 {
1576     v16u8 tmp_vec;
1577     v16u8 bs = { 0 };
1578 
1579     tmp_vec = (v16u8) __msa_fill_b(bs0);
1580     bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
1581     tmp_vec = (v16u8) __msa_fill_b(bs1);
1582     bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
1583     tmp_vec = (v16u8) __msa_fill_b(bs2);
1584     bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
1585     tmp_vec = (v16u8) __msa_fill_b(bs3);
1586     bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
1587 
1588     if (!__msa_test_bz_v(bs)) {
1589         v16u8 alpha, beta, is_less_than, is_less_than_beta;
1590         v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
1591         v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
1592         v16u8 is_less_than_alpha, is_bs_greater_than0;
1593         v8i16 p0_r, q0_r, p0_l, q0_l;
1594         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1595         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
1596         v16i8 zero = { 0 };
1597         v16i8 tc = { 0 };
1598 
1599         tmp_vec = (v16u8) __msa_fill_b(tc0);
1600         tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);
1601         tmp_vec = (v16u8) __msa_fill_b(tc1);
1602         tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);
1603         tmp_vec = (v16u8) __msa_fill_b(tc2);
1604         tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);
1605         tmp_vec = (v16u8) __msa_fill_b(tc3);
1606         tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
1607 
1608         alpha = (v16u8) __msa_fill_b(alpha_in);
1609         beta = (v16u8) __msa_fill_b(beta_in);
1610 
1611         LD_UB5(data - (3 * image_width), image_width,
1612                p2_org, p1_org, p0_org, q0_org, q1_org);
1613 
1614         is_bs_greater_than0 = ((v16u8) zero < bs);
1615         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1616         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1617         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1618 
1619         is_less_than_alpha = (p0_asub_q0 < alpha);
1620         is_less_than_beta = (p1_asub_p0 < beta);
1621         is_less_than = is_less_than_beta & is_less_than_alpha;
1622         is_less_than_beta = (q1_asub_q0 < beta);
1623         is_less_than = is_less_than_beta & is_less_than;
1624         is_less_than = is_less_than & is_bs_greater_than0;
1625 
1626         if (!__msa_test_bz_v(is_less_than)) {
1627             v16i8 sign_negate_tc, negate_tc;
1628             v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
1629             v16u8 p2_asub_p0, q2_asub_q0;
1630 
1631             q2_org = LD_UB(data + (2 * image_width));
1632             negate_tc = zero - tc;
1633             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
1634 
1635             ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
1636 
1637             UNPCK_UB_SH(tc, tc_r, tc_l);
1638             UNPCK_UB_SH(p1_org, p1_org_r, p1_org_l);
1639             UNPCK_UB_SH(p0_org, p0_org_r, p0_org_l);
1640             UNPCK_UB_SH(q0_org, q0_org_r, q0_org_l);
1641 
1642             p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
1643             is_less_than_beta = (p2_asub_p0 < beta);
1644             is_less_than_beta = is_less_than_beta & is_less_than;
1645 
1646             if (!__msa_test_bz_v(is_less_than_beta)) {
1647                 v16u8 p1;
1648                 v8i16 p1_r = { 0 };
1649                 v8i16 p1_l = { 0 };
1650                 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);
1651                 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
1652 
1653                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, p1_org_r, p2_org_r,
1654                                  negate_tc_r, tc_r, p1_r);
1655                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, p1_org_l, p2_org_l,
1656                                  i16_negatetc_l, tc_l, p1_l);
1657 
1658                 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
1659                 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
1660                 ST_UB(p1_org, data - (2 * image_width));
1661 
1662                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1663                 tc = tc + (v16i8) is_less_than_beta;
1664             }
1665 
1666             q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
1667             is_less_than_beta = (q2_asub_q0 < beta);
1668             is_less_than_beta = is_less_than_beta & is_less_than;
1669 
1670             q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);
1671             q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
1672 
1673             if (!__msa_test_bz_v(is_less_than_beta)) {
1674                 v16u8 q1;
1675                 v8i16 q1_r = { 0 };
1676                 v8i16 q1_l = { 0 };
1677                 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);
1678                 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
1679 
1680                 AVC_LPF_P1_OR_Q1(p0_org_r, q0_org_r, q1_org_r, q2_org_r,
1681                                  negate_tc_r, tc_r, q1_r);
1682                 AVC_LPF_P1_OR_Q1(p0_org_l, q0_org_l, q1_org_l, q2_org_l,
1683                                  i16_negatetc_l, tc_l, q1_l);
1684 
1685                 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
1686                 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
1687                 ST_UB(q1_org, data + image_width);
1688 
1689                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
1690                 tc = tc + (v16i8) is_less_than_beta;
1691             }
1692             {
1693                 v16i8 negate_thresh, sign_negate_thresh;
1694                 v8i16 threshold_r, threshold_l;
1695                 v8i16 negate_thresh_l, negate_thresh_r;
1696 
1697                 negate_thresh = zero - tc;
1698                 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
1699 
1700                 ILVR_B2_SH(zero, tc, sign_negate_thresh, negate_thresh,
1701                            threshold_r, negate_thresh_r);
1702                 AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r,
1703                              negate_thresh_r, threshold_r, p0_r, q0_r);
1704 
1705                 threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
1706                 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
1707                                                        negate_thresh);
1708                 AVC_LPF_P0Q0(q0_org_l, p0_org_l, p1_org_l, q1_org_l,
1709                              negate_thresh_l, threshold_l, p0_l, q0_l);
1710             }
1711 
1712             PCKEV_B2_UB(p0_l, p0_r, q0_l, q0_r, p0, q0);
1713 
1714             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
1715             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
1716 
1717             ST_UB(p0_org, (data - image_width));
1718             ST_UB(q0_org, data);
1719         }
1720     }
1721 }
1722 
avc_h_loop_filter_luma_mbaff_msa(uint8_t * in,ptrdiff_t stride,int32_t alpha_in,int32_t beta_in,int8_t * tc0)1723 static void avc_h_loop_filter_luma_mbaff_msa(uint8_t *in, ptrdiff_t stride,
1724                                              int32_t alpha_in, int32_t beta_in,
1725                                              int8_t *tc0)
1726 {
1727     uint8_t *data = in;
1728     uint32_t out0, out1, out2, out3;
1729     uint64_t load;
1730     uint32_t tc_val;
1731     v16u8 alpha, beta;
1732     v16i8 inp0 = { 0 };
1733     v16i8 inp1 = { 0 };
1734     v16i8 inp2 = { 0 };
1735     v16i8 inp3 = { 0 };
1736     v16i8 inp4 = { 0 };
1737     v16i8 inp5 = { 0 };
1738     v16i8 inp6 = { 0 };
1739     v16i8 inp7 = { 0 };
1740     v16i8 src0, src1, src2, src3;
1741     v8i16 src4, src5, src6, src7;
1742     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
1743     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
1744     v16u8 is_less_than_beta1, is_less_than_beta2;
1745     v8i16 tc, tc_orig_r, tc_plus1;
1746     v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
1747     v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
1748     v8i16 src2_r, src3_r;
1749     v8i16 p2_r, p1_r, q2_r, q1_r;
1750     v16u8 p2, q2, p0, q0;
1751     v4i32 dst0, dst1;
1752     v16i8 zeros = { 0 };
1753 
1754     alpha = (v16u8) __msa_fill_b(alpha_in);
1755     beta = (v16u8) __msa_fill_b(beta_in);
1756 
1757     if (tc0[0] < 0) {
1758         data += (2 * stride);
1759     } else {
1760         load = LD(data - 3);
1761         inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
1762         load = LD(data - 3 + stride);
1763         inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
1764         data += (2 * stride);
1765     }
1766 
1767     if (tc0[1] < 0) {
1768         data += (2 * stride);
1769     } else {
1770         load = LD(data - 3);
1771         inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
1772         load = LD(data - 3 + stride);
1773         inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
1774         data += (2 * stride);
1775     }
1776 
1777     if (tc0[2] < 0) {
1778         data += (2 * stride);
1779     } else {
1780         load = LD(data - 3);
1781         inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
1782         load = LD(data - 3 + stride);
1783         inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
1784         data += (2 * stride);
1785     }
1786 
1787     if (tc0[3] < 0) {
1788         data += (2 * stride);
1789     } else {
1790         load = LD(data - 3);
1791         inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
1792         load = LD(data - 3 + stride);
1793         inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
1794         data += (2 * stride);
1795     }
1796 
1797     ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
1798                src0, src1, src2, src3);
1799 
1800     ILVR_H2_SH(src1, src0, src3, src2, src4, src6);
1801     ILVL_H2_SH(src1, src0, src3, src2, src5, src7);
1802 
1803     src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
1804     src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
1805     src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
1806     src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
1807     src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
1808     src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
1809 
1810     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);
1811     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
1812     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
1813     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
1814     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
1815 
1816     is_less_than_alpha = (p0_asub_q0 < alpha);
1817     is_less_than_beta = (p1_asub_p0 < beta);
1818     is_less_than = is_less_than_alpha & is_less_than_beta;
1819     is_less_than_beta = (q1_asub_q0 < beta);
1820     is_less_than = is_less_than_beta & is_less_than;
1821 
1822     is_less_than_beta1 = (p2_asub_p0 < beta);
1823     is_less_than_beta2 = (q2_asub_q0 < beta);
1824 
1825     p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
1826     p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1827     p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
1828 
1829     ILVR_B2_SH(zeros, src0, zeros, src1, p2_r, p1_r);
1830     p2_r += p0_add_q0;
1831     p2_r >>= 1;
1832     p2_r -= p1_r;
1833     ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
1834     q2_r += p0_add_q0;
1835     q2_r >>= 1;
1836     q2_r -= q1_r;
1837 
1838     tc_val = LW(tc0);
1839     tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
1840     tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
1841     is_tc_orig1 = tc_orig;
1842     is_tc_orig2 = tc_orig;
1843     tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
1844     tc = tc_orig_r;
1845 
1846     CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
1847     CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
1848 
1849     p2_r += p1_r;
1850     q2_r += q1_r;
1851 
1852     PCKEV_B2_UB(p2_r, p2_r, q2_r, q2_r, p2, q2);
1853 
1854     is_tc_orig1 = (zeros < is_tc_orig1);
1855     is_tc_orig2 = is_tc_orig1;
1856     is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
1857     is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
1858     is_tc_orig1 = is_less_than & is_tc_orig1;
1859     is_tc_orig2 = is_less_than & is_tc_orig2;
1860 
1861     p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
1862     q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
1863 
1864     q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
1865     q0_sub_p0 <<= 2;
1866     p1_sub_q1 = p1_r - q1_r;
1867     q0_sub_p0 += p1_sub_q1;
1868     q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
1869 
1870     tc_plus1 = tc + 1;
1871     is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
1872                                               (v16i8) is_less_than_beta1);
1873     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
1874     tc_plus1 = tc + 1;
1875     is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
1876                                               (v16i8) is_less_than_beta2);
1877     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
1878 
1879     CLIP_SH(q0_sub_p0, -tc, tc);
1880 
1881     ILVR_B2_SH(zeros, src2, zeros, src3, src2_r, src3_r);
1882     src2_r += q0_sub_p0;
1883     src3_r -= q0_sub_p0;
1884 
1885     CLIP_SH2_0_255(src2_r, src3_r);
1886 
1887     PCKEV_B2_UB(src2_r, src2_r, src3_r, src3_r, p0, q0);
1888 
1889     p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);
1890     q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
1891 
1892     ILVR_B2_UB(p0, p2, q2, q0, p2, q2);
1893 
1894     ILVRL_H2_SW(q2, p2, dst0, dst1);
1895 
1896     data = in;
1897 
1898     out0 = __msa_copy_u_w(dst0, 0);
1899     out1 = __msa_copy_u_w(dst0, 1);
1900     out2 = __msa_copy_u_w(dst0, 2);
1901     out3 = __msa_copy_u_w(dst0, 3);
1902 
1903     if (tc0[0] < 0) {
1904         data += (2 * stride);
1905     } else {
1906         SW(out0, (data - 2));
1907         data += stride;
1908         SW(out1, (data - 2));
1909         data += stride;
1910     }
1911 
1912     if (tc0[1] < 0) {
1913         data += (2 * stride);
1914     } else {
1915         SW(out2, (data - 2));
1916         data += stride;
1917         SW(out3, (data - 2));
1918         data += stride;
1919     }
1920 
1921     out0 = __msa_copy_u_w(dst1, 0);
1922     out1 = __msa_copy_u_w(dst1, 1);
1923     out2 = __msa_copy_u_w(dst1, 2);
1924     out3 = __msa_copy_u_w(dst1, 3);
1925 
1926     if (tc0[2] < 0) {
1927         data += (2 * stride);
1928     } else {
1929         SW(out0, (data - 2));
1930         data += stride;
1931         SW(out1, (data - 2));
1932         data += stride;
1933     }
1934 
1935     if (tc0[3] >= 0) {
1936         SW(out2, (data - 2));
1937         data += stride;
1938         SW(out3, (data - 2));
1939     }
1940 }
1941 
avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t * data,uint8_t bs0,uint8_t bs1,uint8_t bs2,uint8_t bs3,uint8_t tc0,uint8_t tc1,uint8_t tc2,uint8_t tc3,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)1942 static void avc_loopfilter_cb_or_cr_inter_edge_hor_msa(uint8_t *data,
1943                                                        uint8_t bs0, uint8_t bs1,
1944                                                        uint8_t bs2, uint8_t bs3,
1945                                                        uint8_t tc0, uint8_t tc1,
1946                                                        uint8_t tc2, uint8_t tc3,
1947                                                        uint8_t alpha_in,
1948                                                        uint8_t beta_in,
1949                                                        ptrdiff_t img_width)
1950 {
1951     v16u8 alpha, beta;
1952     v8i16 tmp_vec;
1953     v8i16 bs = { 0 };
1954     v8i16 tc = { 0 };
1955     v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
1956     v16u8 is_less_than;
1957     v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
1958     v8i16 p0_r, q0_r;
1959     v16u8 p1_org, p0_org, q0_org, q1_org;
1960     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
1961     v16i8 negate_tc, sign_negate_tc;
1962     v8i16 tc_r, negate_tc_r;
1963     v16i8 zero = { 0 };
1964 
1965     tmp_vec = (v8i16) __msa_fill_b(bs0);
1966     bs = __msa_insve_h(bs, 0, tmp_vec);
1967     tmp_vec = (v8i16) __msa_fill_b(bs1);
1968     bs = __msa_insve_h(bs, 1, tmp_vec);
1969     tmp_vec = (v8i16) __msa_fill_b(bs2);
1970     bs = __msa_insve_h(bs, 2, tmp_vec);
1971     tmp_vec = (v8i16) __msa_fill_b(bs3);
1972     bs = __msa_insve_h(bs, 3, tmp_vec);
1973 
1974     if (!__msa_test_bz_v((v16u8) bs)) {
1975         tmp_vec = (v8i16) __msa_fill_b(tc0);
1976         tc = __msa_insve_h(tc, 0, tmp_vec);
1977         tmp_vec = (v8i16) __msa_fill_b(tc1);
1978         tc = __msa_insve_h(tc, 1, tmp_vec);
1979         tmp_vec = (v8i16) __msa_fill_b(tc2);
1980         tc = __msa_insve_h(tc, 2, tmp_vec);
1981         tmp_vec = (v8i16) __msa_fill_b(tc3);
1982         tc = __msa_insve_h(tc, 3, tmp_vec);
1983 
1984         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
1985 
1986         alpha = (v16u8) __msa_fill_b(alpha_in);
1987         beta = (v16u8) __msa_fill_b(beta_in);
1988 
1989         LD_UB4(data - (img_width << 1), img_width,
1990                p1_org, p0_org, q0_org, q1_org);
1991 
1992         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
1993         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
1994         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
1995 
1996         is_less_than_alpha = (p0_asub_q0 < alpha);
1997         is_less_than_beta = (p1_asub_p0 < beta);
1998         is_less_than = is_less_than_beta & is_less_than_alpha;
1999         is_less_than_beta = (q1_asub_q0 < beta);
2000         is_less_than = is_less_than_beta & is_less_than;
2001         is_less_than = is_less_than & is_bs_greater_than0;
2002 
2003         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2004 
2005         if (!__msa_test_bz_v(is_less_than)) {
2006             negate_tc = zero - (v16i8) tc;
2007             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2008 
2009             ILVR_B2_SH(zero, tc, sign_negate_tc, negate_tc, tc_r, negate_tc_r);
2010 
2011             ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2012                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2013 
2014             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2015                          tc_r, p0_r, q0_r);
2016 
2017             PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2018 
2019             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2020             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2021 
2022             ST_UB(q0_org, data);
2023             ST_UB(p0_org, (data - img_width));
2024         }
2025     }
2026 }
2027 
avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t * data,uint8_t bs0,uint8_t bs1,uint8_t bs2,uint8_t bs3,uint8_t tc0,uint8_t tc1,uint8_t tc2,uint8_t tc3,uint8_t alpha_in,uint8_t beta_in,ptrdiff_t img_width)2028 static void avc_loopfilter_cb_or_cr_inter_edge_ver_msa(uint8_t *data,
2029                                                        uint8_t bs0, uint8_t bs1,
2030                                                        uint8_t bs2, uint8_t bs3,
2031                                                        uint8_t tc0, uint8_t tc1,
2032                                                        uint8_t tc2, uint8_t tc3,
2033                                                        uint8_t alpha_in,
2034                                                        uint8_t beta_in,
2035                                                        ptrdiff_t img_width)
2036 {
2037     uint8_t *src;
2038     v16u8 alpha, beta;
2039     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
2040     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
2041     v16u8 p0, q0;
2042     v8i16 p0_r = { 0 };
2043     v8i16 q0_r = { 0 };
2044     v16u8 p1_org, p0_org, q0_org, q1_org;
2045     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
2046     v16u8 is_bs_greater_than0;
2047     v8i16 tc_r, negate_tc_r;
2048     v16i8 negate_tc, sign_negate_tc;
2049     v16i8 zero = { 0 };
2050     v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
2051     v8i16 tmp1, tmp_vec, bs = { 0 };
2052     v8i16 tc = { 0 };
2053 
2054     tmp_vec = (v8i16) __msa_fill_b(bs0);
2055     bs = __msa_insve_h(bs, 0, tmp_vec);
2056     tmp_vec = (v8i16) __msa_fill_b(bs1);
2057     bs = __msa_insve_h(bs, 1, tmp_vec);
2058     tmp_vec = (v8i16) __msa_fill_b(bs2);
2059     bs = __msa_insve_h(bs, 2, tmp_vec);
2060     tmp_vec = (v8i16) __msa_fill_b(bs3);
2061     bs = __msa_insve_h(bs, 3, tmp_vec);
2062 
2063     if (!__msa_test_bz_v((v16u8) bs)) {
2064         tmp_vec = (v8i16) __msa_fill_b(tc0);
2065         tc = __msa_insve_h(tc, 0, tmp_vec);
2066         tmp_vec = (v8i16) __msa_fill_b(tc1);
2067         tc = __msa_insve_h(tc, 1, tmp_vec);
2068         tmp_vec = (v8i16) __msa_fill_b(tc2);
2069         tc = __msa_insve_h(tc, 2, tmp_vec);
2070         tmp_vec = (v8i16) __msa_fill_b(tc3);
2071         tc = __msa_insve_h(tc, 3, tmp_vec);
2072 
2073         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
2074 
2075         LD_UB8((data - 2), img_width,
2076                row0, row1, row2, row3, row4, row5, row6, row7);
2077 
2078         TRANSPOSE8x4_UB_UB(row0, row1, row2, row3,
2079                            row4, row5, row6, row7,
2080                            p1_org, p0_org, q0_org, q1_org);
2081 
2082         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
2083         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
2084         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
2085 
2086         alpha = (v16u8) __msa_fill_b(alpha_in);
2087         beta = (v16u8) __msa_fill_b(beta_in);
2088 
2089         is_less_than_alpha = (p0_asub_q0 < alpha);
2090         is_less_than_beta = (p1_asub_p0 < beta);
2091         is_less_than = is_less_than_beta & is_less_than_alpha;
2092         is_less_than_beta = (q1_asub_q0 < beta);
2093         is_less_than = is_less_than_beta & is_less_than;
2094         is_less_than = is_bs_greater_than0 & is_less_than;
2095 
2096         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
2097 
2098         if (!__msa_test_bz_v(is_less_than)) {
2099             ILVR_B4_SH(zero, p1_org, zero, p0_org, zero, q0_org, zero, q1_org,
2100                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
2101 
2102             negate_tc = zero - (v16i8) tc;
2103             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
2104 
2105             ILVR_B2_SH(sign_negate_tc, negate_tc, zero, tc, negate_tc_r, tc_r);
2106 
2107             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
2108                          tc_r, p0_r, q0_r);
2109 
2110             PCKEV_B2_UB(zero, p0_r, zero, q0_r, p0, q0);
2111 
2112             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
2113             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
2114             tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
2115             src = data - 1;
2116             ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
2117             src += 4 * img_width;
2118             ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
2119         }
2120     }
2121 }
2122 
avc_h_loop_filter_chroma422_msa(uint8_t * src,ptrdiff_t stride,int32_t alpha_in,int32_t beta_in,int8_t * tc0)2123 static void avc_h_loop_filter_chroma422_msa(uint8_t *src, ptrdiff_t stride,
2124                                             int32_t alpha_in, int32_t beta_in,
2125                                             int8_t *tc0)
2126 {
2127     int32_t col, tc_val;
2128     v16u8 alpha, beta, res;
2129 
2130     alpha = (v16u8) __msa_fill_b(alpha_in);
2131     beta = (v16u8) __msa_fill_b(beta_in);
2132 
2133     for (col = 0; col < 4; col++) {
2134         tc_val = (tc0[col] - 1) + 1;
2135 
2136         if (tc_val <= 0) {
2137             src += (4 * stride);
2138             continue;
2139         }
2140 
2141         AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2142         ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
2143         src += (4 * stride);
2144     }
2145 }
2146 
avc_h_loop_filter_chroma422_mbaff_msa(uint8_t * src,ptrdiff_t stride,int32_t alpha_in,int32_t beta_in,int8_t * tc0)2147 static void avc_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2148                                                   ptrdiff_t stride,
2149                                                   int32_t alpha_in,
2150                                                   int32_t beta_in,
2151                                                   int8_t *tc0)
2152 {
2153     int32_t col, tc_val;
2154     int16_t out0, out1;
2155     v16u8 alpha, beta, res;
2156 
2157     alpha = (v16u8) __msa_fill_b(alpha_in);
2158     beta = (v16u8) __msa_fill_b(beta_in);
2159 
2160     for (col = 0; col < 4; col++) {
2161         tc_val = (tc0[col] - 1) + 1;
2162 
2163         if (tc_val <= 0) {
2164             src += 4 * stride;
2165             continue;
2166         }
2167 
2168         AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res);
2169 
2170         out0 = __msa_copy_s_h((v8i16) res, 0);
2171         out1 = __msa_copy_s_h((v8i16) res, 1);
2172 
2173         SH(out0, (src - 1));
2174         src += stride;
2175         SH(out1, (src - 1));
2176         src += stride;
2177     }
2178 }
2179 
ff_h264_h_lpf_luma_inter_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta,int8_t * tc)2180 void ff_h264_h_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2181                                   int alpha, int beta, int8_t *tc)
2182 {
2183     uint8_t bs0 = 1;
2184     uint8_t bs1 = 1;
2185     uint8_t bs2 = 1;
2186     uint8_t bs3 = 1;
2187 
2188     if (tc[0] < 0)
2189         bs0 = 0;
2190     if (tc[1] < 0)
2191         bs1 = 0;
2192     if (tc[2] < 0)
2193         bs2 = 0;
2194     if (tc[3] < 0)
2195         bs3 = 0;
2196 
2197     avc_loopfilter_luma_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2198                                            tc[0], tc[1], tc[2], tc[3],
2199                                            alpha, beta, img_width);
2200 }
2201 
ff_h264_v_lpf_luma_inter_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta,int8_t * tc)2202 void ff_h264_v_lpf_luma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2203                                   int alpha, int beta, int8_t *tc)
2204 {
2205 
2206     uint8_t bs0 = 1;
2207     uint8_t bs1 = 1;
2208     uint8_t bs2 = 1;
2209     uint8_t bs3 = 1;
2210 
2211     if (tc[0] < 0)
2212         bs0 = 0;
2213     if (tc[1] < 0)
2214         bs1 = 0;
2215     if (tc[2] < 0)
2216         bs2 = 0;
2217     if (tc[3] < 0)
2218         bs3 = 0;
2219 
2220     avc_loopfilter_luma_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2221                                            tc[0], tc[1], tc[2], tc[3],
2222                                            alpha, beta, img_width);
2223 }
2224 
ff_h264_h_lpf_chroma_inter_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta,int8_t * tc)2225 void ff_h264_h_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2226                                     int alpha, int beta, int8_t *tc)
2227 {
2228     uint8_t bs0 = 1;
2229     uint8_t bs1 = 1;
2230     uint8_t bs2 = 1;
2231     uint8_t bs3 = 1;
2232 
2233     if (tc[0] < 0)
2234         bs0 = 0;
2235     if (tc[1] < 0)
2236         bs1 = 0;
2237     if (tc[2] < 0)
2238         bs2 = 0;
2239     if (tc[3] < 0)
2240         bs3 = 0;
2241 
2242     avc_loopfilter_cb_or_cr_inter_edge_ver_msa(data, bs0, bs1, bs2, bs3,
2243                                                tc[0], tc[1], tc[2], tc[3],
2244                                                alpha, beta, img_width);
2245 }
2246 
ff_h264_v_lpf_chroma_inter_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta,int8_t * tc)2247 void ff_h264_v_lpf_chroma_inter_msa(uint8_t *data, ptrdiff_t img_width,
2248                                     int alpha, int beta, int8_t *tc)
2249 {
2250     uint8_t bs0 = 1;
2251     uint8_t bs1 = 1;
2252     uint8_t bs2 = 1;
2253     uint8_t bs3 = 1;
2254 
2255     if (tc[0] < 0)
2256         bs0 = 0;
2257     if (tc[1] < 0)
2258         bs1 = 0;
2259     if (tc[2] < 0)
2260         bs2 = 0;
2261     if (tc[3] < 0)
2262         bs3 = 0;
2263 
2264     avc_loopfilter_cb_or_cr_inter_edge_hor_msa(data, bs0, bs1, bs2, bs3,
2265                                                tc[0], tc[1], tc[2], tc[3],
2266                                                alpha, beta, img_width);
2267 }
2268 
ff_h264_h_lpf_luma_intra_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta)2269 void ff_h264_h_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2270                                   int alpha, int beta)
2271 {
2272     avc_loopfilter_luma_intra_edge_ver_msa(data, (uint8_t) alpha,
2273                                            (uint8_t) beta,
2274                                            img_width);
2275 }
2276 
ff_h264_v_lpf_luma_intra_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta)2277 void ff_h264_v_lpf_luma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2278                                   int alpha, int beta)
2279 {
2280     avc_loopfilter_luma_intra_edge_hor_msa(data, (uint8_t) alpha,
2281                                            (uint8_t) beta,
2282                                            img_width);
2283 }
2284 
ff_h264_h_lpf_chroma_intra_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta)2285 void ff_h264_h_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2286                                     int alpha, int beta)
2287 {
2288     avc_loopfilter_cb_or_cr_intra_edge_ver_msa(data, (uint8_t) alpha,
2289                                                (uint8_t) beta,
2290                                                img_width);
2291 }
2292 
ff_h264_v_lpf_chroma_intra_msa(uint8_t * data,ptrdiff_t img_width,int alpha,int beta)2293 void ff_h264_v_lpf_chroma_intra_msa(uint8_t *data, ptrdiff_t img_width,
2294                                     int alpha, int beta)
2295 {
2296     avc_loopfilter_cb_or_cr_intra_edge_hor_msa(data, (uint8_t) alpha,
2297                                                (uint8_t) beta,
2298                                                img_width);
2299 }
2300 
ff_h264_h_loop_filter_chroma422_msa(uint8_t * src,ptrdiff_t ystride,int32_t alpha,int32_t beta,int8_t * tc0)2301 void ff_h264_h_loop_filter_chroma422_msa(uint8_t *src,
2302                                          ptrdiff_t ystride,
2303                                          int32_t alpha, int32_t beta,
2304                                          int8_t *tc0)
2305 {
2306     avc_h_loop_filter_chroma422_msa(src, ystride, alpha, beta, tc0);
2307 }
2308 
ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t * src,ptrdiff_t ystride,int32_t alpha,int32_t beta,int8_t * tc0)2309 void ff_h264_h_loop_filter_chroma422_mbaff_msa(uint8_t *src,
2310                                                ptrdiff_t ystride,
2311                                                int32_t alpha,
2312                                                int32_t beta,
2313                                                int8_t *tc0)
2314 {
2315     avc_h_loop_filter_chroma422_mbaff_msa(src, ystride, alpha, beta, tc0);
2316 }
2317 
ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t * src,ptrdiff_t ystride,int32_t alpha,int32_t beta,int8_t * tc0)2318 void ff_h264_h_loop_filter_luma_mbaff_msa(uint8_t *src,
2319                                           ptrdiff_t ystride,
2320                                           int32_t alpha,
2321                                           int32_t beta,
2322                                           int8_t *tc0)
2323 {
2324     avc_h_loop_filter_luma_mbaff_msa(src, ystride, alpha, beta, tc0);
2325 }
2326 
ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t * src,ptrdiff_t ystride,int32_t alpha,int32_t beta)2327 void ff_h264_h_loop_filter_luma_mbaff_intra_msa(uint8_t *src,
2328                                                 ptrdiff_t ystride,
2329                                                 int32_t alpha,
2330                                                 int32_t beta)
2331 {
2332     avc_h_loop_filter_luma_mbaff_intra_msa(src, ystride, alpha, beta);
2333 }
2334 
ff_weight_h264_pixels16_8_msa(uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_src,int offset_in)2335 void ff_weight_h264_pixels16_8_msa(uint8_t *src, ptrdiff_t stride,
2336                                    int height, int log2_denom,
2337                                    int weight_src, int offset_in)
2338 {
2339     uint32_t offset_val;
2340     v16i8 zero = { 0 };
2341     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2342     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2343     v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
2344     v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
2345     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2346     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2347     v8i16 wgt, denom, offset;
2348 
2349     offset_val = (unsigned) offset_in << log2_denom;
2350 
2351     wgt = __msa_fill_h(weight_src);
2352     offset = __msa_fill_h(offset_val);
2353     denom = __msa_fill_h(log2_denom);
2354 
2355     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2356     ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
2357                src2_r, src3_r);
2358     ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
2359                src2_l, src3_l);
2360     ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
2361                src6_r, src7_r);
2362     ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
2363                src6_l, src7_l);
2364     MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
2365          tmp3);
2366     MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
2367          tmp7);
2368     MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
2369          tmp11);
2370     MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2371          tmp14, tmp15);
2372     ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset, tmp0,
2373                 tmp1, tmp2, tmp3);
2374     ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset, tmp4,
2375                 tmp5, tmp6, tmp7);
2376     ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset, tmp8,
2377                 tmp9, tmp10, tmp11);
2378     ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2379                 tmp12, tmp13, tmp14, tmp15);
2380     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2381     MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2382     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2383     SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2384     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2385     SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2386     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2387                 dst2, dst3);
2388     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2389                 dst5, dst6, dst7);
2390     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2391     src += 8 * stride;
2392 
2393     if (16 == height) {
2394         LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2395         ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r,
2396                    src1_r, src2_r, src3_r);
2397         ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l,
2398                    src1_l, src2_l, src3_l);
2399         ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r,
2400                    src5_r, src6_r, src7_r);
2401         ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l,
2402                    src5_l, src6_l, src7_l);
2403         MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
2404              tmp2, tmp3);
2405         MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
2406              tmp6, tmp7);
2407         MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
2408              tmp10, tmp11);
2409         MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
2410              tmp14, tmp15);
2411         ADDS_SH4_SH(tmp0, offset, tmp1, offset, tmp2, offset, tmp3, offset,
2412                     tmp0, tmp1, tmp2, tmp3);
2413         ADDS_SH4_SH(tmp4, offset, tmp5, offset, tmp6, offset, tmp7, offset,
2414                     tmp4, tmp5, tmp6, tmp7);
2415         ADDS_SH4_SH(tmp8, offset, tmp9, offset, tmp10, offset, tmp11, offset,
2416                     tmp8, tmp9, tmp10, tmp11);
2417         ADDS_SH4_SH(tmp12, offset, tmp13, offset, tmp14, offset, tmp15, offset,
2418                     tmp12, tmp13, tmp14, tmp15);
2419         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
2420         MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
2421         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
2422         SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
2423         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
2424         SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
2425         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2426                     dst2, dst3);
2427         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2428                     dst5, dst6, dst7);
2429         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, src, stride);
2430     }
2431 }
2432 
ff_weight_h264_pixels8_8_msa(uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_src,int offset)2433 void ff_weight_h264_pixels8_8_msa(uint8_t *src, ptrdiff_t stride,
2434                                   int height, int log2_denom,
2435                                   int weight_src, int offset)
2436 {
2437     if (4 == height) {
2438         avc_wgt_8x4_msa(src, stride, log2_denom, weight_src, offset);
2439     } else if (8 == height) {
2440         avc_wgt_8x8_msa(src, stride, log2_denom, weight_src, offset);
2441     } else {
2442         avc_wgt_8x16_msa(src, stride, log2_denom, weight_src, offset);
2443     }
2444 }
2445 
ff_weight_h264_pixels4_8_msa(uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_src,int offset)2446 void ff_weight_h264_pixels4_8_msa(uint8_t *src, ptrdiff_t stride,
2447                                   int height, int log2_denom,
2448                                   int weight_src, int offset)
2449 {
2450     if (2 == height) {
2451         avc_wgt_4x2_msa(src, stride, log2_denom, weight_src, offset);
2452     } else if (4 == height) {
2453         avc_wgt_4x4_msa(src, stride, log2_denom, weight_src, offset);
2454     } else {
2455         avc_wgt_4x8_msa(src, stride, log2_denom, weight_src, offset);
2456     }
2457 }
2458 
ff_biweight_h264_pixels16_8_msa(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_dst,int weight_src,int offset_in)2459 void ff_biweight_h264_pixels16_8_msa(uint8_t *dst, uint8_t *src,
2460                                      ptrdiff_t stride, int height,
2461                                      int log2_denom, int weight_dst,
2462                                      int weight_src, int offset_in)
2463 {
2464     v16i8 src_wgt, dst_wgt, wgt;
2465     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2466     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2467     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2468     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2469     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2470     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
2471     v8i16 denom, offset;
2472 
2473     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
2474     offset_in += (128 * (weight_src + weight_dst));
2475 
2476     src_wgt = __msa_fill_b(weight_src);
2477     dst_wgt = __msa_fill_b(weight_dst);
2478     offset = __msa_fill_h(offset_in);
2479     denom = __msa_fill_h(log2_denom + 1);
2480 
2481     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
2482 
2483     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2484     src += 8 * stride;
2485     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2486     XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2487     XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2488     ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
2489                vec6);
2490     ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
2491                vec7);
2492     ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2493                vec12, vec14);
2494     ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2495                vec13, vec15);
2496     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2497     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2498     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2499     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2500     tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2501     tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2502     tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2503     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2504     tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2505     tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2506     tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2507     tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2508     tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2509     tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2510     tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2511     tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2512     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2513     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2514     SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2515     SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2516     CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2517     CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2518     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2519                 dst2, dst3);
2520     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2521                 dst5, dst6, dst7);
2522     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2523     dst += 8 * stride;
2524 
2525     if (16 == height) {
2526         LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2527         LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2528         XORI_B8_128_UB(src0, src1, src2, src3, src4, src5, src6, src7);
2529         XORI_B8_128_UB(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2530         ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
2531                    vec4, vec6);
2532         ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
2533                    vec5, vec7);
2534         ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
2535                    vec12, vec14);
2536         ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
2537                    vec13, vec15);
2538         tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
2539         tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
2540         tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);
2541         tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
2542         tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);
2543         tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);
2544         tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);
2545         tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
2546         tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);
2547         tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);
2548         tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);
2549         tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);
2550         tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);
2551         tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);
2552         tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);
2553         tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
2554         SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
2555         SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
2556         SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
2557         SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
2558         CLIP_SH8_0_255(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
2559         CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
2560         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
2561                     dst2, dst3);
2562         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
2563                     dst5, dst6, dst7);
2564         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
2565     }
2566 }
2567 
ff_biweight_h264_pixels8_8_msa(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_dst,int weight_src,int offset)2568 void ff_biweight_h264_pixels8_8_msa(uint8_t *dst, uint8_t *src,
2569                                     ptrdiff_t stride, int height,
2570                                     int log2_denom, int weight_dst,
2571                                     int weight_src, int offset)
2572 {
2573     if (4 == height) {
2574         avc_biwgt_8x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2575                           offset);
2576     } else if (8 == height) {
2577         avc_biwgt_8x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2578                           offset);
2579     } else {
2580         avc_biwgt_8x16_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2581                            offset);
2582     }
2583 }
2584 
ff_biweight_h264_pixels4_8_msa(uint8_t * dst,uint8_t * src,ptrdiff_t stride,int height,int log2_denom,int weight_dst,int weight_src,int offset)2585 void ff_biweight_h264_pixels4_8_msa(uint8_t *dst, uint8_t *src,
2586                                     ptrdiff_t stride, int height,
2587                                     int log2_denom, int weight_dst,
2588                                     int weight_src, int offset)
2589 {
2590     if (2 == height) {
2591         avc_biwgt_4x2_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2592                           offset);
2593     } else if (4 == height) {
2594         avc_biwgt_4x4_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2595                           offset);
2596     } else {
2597         avc_biwgt_4x8_msa(src, dst, stride, log2_denom, weight_src, weight_dst,
2598                           offset);
2599     }
2600 }
2601