/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"

static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src - (stride << 2);
    uint8_t *p2 = src - ((stride << 1) + stride);
    uint8_t *p1 = src - (stride << 1);
    uint8_t *p0 = src - stride;
    uint8_t *q0 = src;
    uint8_t *q1 = src + stride;
    uint8_t *q2 = src + (stride << 1);
    uint8_t *q3 = src + (stride << 1) + stride;
    uint8_t flag0, flag1;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    uint64_t dst_val0, dst_val1;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v2i64 cmp3;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

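    /* Deblocking decision variables (H.265 8.7.2.5.3): dp = |p2 - 2*p1 + p0|
     * and dq = |q2 - 2*q1 + q0| measure local activity on each side of the
     * edge at columns 0 and 3 (first group) and 4 and 7 (second group);
     * a group is left unfiltered when its d0 + d3 sum reaches beta. */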
    dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
    dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
    dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
    dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
    dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
    dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
    dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;

    p_is_pcm0 = p_is_pcm[0];
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm0 = q_is_pcm[0];
    q_is_pcm4 = q_is_pcm[1];

    cmp0 = __msa_fill_d(p_is_pcm0);
    cmp1 = __msa_fill_d(p_is_pcm4);
    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    cmp0 = (v2i64) __msa_fill_w(d0030);
    cmp1 = (v2i64) __msa_fill_w(d0434);
    cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
    cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        p3_src = LD_UH(p3);
        p2_src = LD_UH(p2);
        p1_src = LD_UH(p1);
        p0_src = LD_UH(p0);

        cmp0 = __msa_fill_d(q_is_pcm0);
        cmp1 = __msa_fill_d(q_is_pcm4);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

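        /* Decision thresholds: beta >> 3 and beta >> 2 bound the flatness
         * tests, and tc25 = (5 * tc + 1) >> 1 bounds |p0 - q0| in the
         * strong-filter condition. */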
        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = ((tc0 * 5 + 1) >> 1);
        tc4 = tc[1];
        tc254 = ((tc4 * 5 + 1) >> 1);

        cmp0 = (v2i64) __msa_fill_h(tc0);
        cmp1 = (v2i64) __msa_fill_h(tc4);

        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                   p3_src, p2_src, p1_src, p0_src);
        q0_src = LD_UH(q0);
        q1_src = LD_UH(q1);
        q2_src = LD_UH(q2);
        q3_src = LD_UH(q3);

        flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
                abs(p0[0] - q0[0]) < tc250;
        flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
                          abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
                          (d30 << 1) < beta20);

        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                   q0_src, q1_src, q2_src, q3_src);
        flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
                abs(p0[4] - q0[4]) < tc254;
        flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
                          abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
                          (d34 << 1) < beta20);

        cmp0 = (v2i64) __msa_fill_w(flag0);
        cmp1 = (v2i64) __msa_fill_w(flag1);
        cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
        cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_pos <<= 1;
            tc_neg = -tc_pos;

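            /* Strong filter (H.265 8.7.2.5.7), e.g.
             * p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3; every
             * correction is clipped to [-2*tc, 2*tc], hence the tc_pos
             * doubling above. */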
            /* p part */
            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* pack results to 8 bit */
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);

            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            SD(dst_val0, p2 + 4 * stride);
            SD(dst_val1, p2 + 5 * stride);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_neg = -tc_pos;

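            /* Weak filter delta: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4,
             * applied only where |delta0| < 10 * tc. */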
            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            p_is_pcm_vec = ~p_is_pcm_vec;
            q_is_pcm_vec = ~q_is_pcm_vec;
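            /* (beta + (beta >> 1)) >> 3 is the side threshold that decides
             * whether p1 (dp sums) and q1 (dq sums) are modified as well. */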
            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);
            p_is_pcm_vec = p_is_pcm_vec | cmp0;

            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            cmp0 = __msa_ceqi_d(cmp0, 0);
            q_is_pcm_vec = q_is_pcm_vec | cmp0;

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                       (v16u8) abs_delta0);
            dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                       (v16u8) abs_delta0);
            dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                       (v16u8) abs_delta0);
            dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                       (v16u8) abs_delta0);
            /* pack results to 8 bit */
            PCKEV_B2_UB(dst2, dst1, dst4, dst3, dst0, dst1);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);

            dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);

            p2 += stride;
            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            /* weak filter ends */
        } else { /* strong + weak */
            /* strong filter */
            tc_pos <<= 1;
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);
            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);

            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);

            /* pack strong results to 8 bit */
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
            dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
            /* strong filter ends */

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            p_is_pcm_vec = ~p_is_pcm_vec;
            q_is_pcm_vec = ~q_is_pcm_vec;
            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(dp00 + dp30 < tmp);
            cmp1 = __msa_fill_d(dp04 + dp34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);

            cmp0 = __msa_fill_d(dq00 + dq30 < tmp);
            cmp1 = __msa_fill_d(dq04 + dq34 < tmp);
            cmp0 = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);
            /* weak filter ends */

            /* pack weak results to 8 bit */
            PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);

            /* select between weak or strong */
            dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
            dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);

            /* pack src to 8 bit */
            PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
            dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);

            dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);

            dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
            dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);

            ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
            SD(dst_val0, p2 + 4 * stride);
            SD(dst_val1, p2 + 5 * stride);
        }
    }
}

static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride,
                                         int32_t beta, int32_t *tc,
                                         uint8_t *p_is_pcm, uint8_t *q_is_pcm)
{
    uint8_t *p3 = src;
    uint8_t *p2 = src + 3 * stride;
    uint8_t *p1 = src + (stride << 2);
    uint8_t *p0 = src + 7 * stride;
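    /* Vertical edge: p3..p0 are reused here as row pointers (rows 0, 3, 4
     * and 7), and each row's p/q samples are its horizontal neighbours, so
     * the decision taps below use offsets -3..2 across the edge. */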
    uint8_t flag0, flag1;
    uint16_t tmp0, tmp1;
    uint32_t tmp2, tmp3;
    int32_t dp00, dq00, dp30, dq30, d00, d30;
    int32_t d0030, d0434;
    int32_t dp04, dq04, dp34, dq34, d04, d34;
    int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
    int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
    v2i64 cmp3;
    v8u16 temp0, temp1;
    v8i16 temp2;
    v8i16 tc_pos, tc_neg;
    v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
    v16i8 zero = { 0 };
    v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;

    dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
    dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
    dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
    dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
    d00 = dp00 + dq00;
    d30 = dp30 + dq30;
    p_is_pcm0 = p_is_pcm[0];
    q_is_pcm0 = q_is_pcm[0];

    dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
    dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
    dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
    dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
    d04 = dp04 + dq04;
    d34 = dp34 + dq34;
    p_is_pcm4 = p_is_pcm[1];
    q_is_pcm4 = q_is_pcm[1];

    cmp0 = __msa_fill_d(p_is_pcm0);
    cmp1 = __msa_fill_d(p_is_pcm4);
    p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
    p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

    d0030 = (d00 + d30) >= beta;
    d0434 = (d04 + d34) >= beta;

    cmp0 = __msa_fill_d(d0030);
    cmp1 = __msa_fill_d(d0434);
    cmp3 = __msa_ilvev_d(cmp1, cmp0);
    cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);

    if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
        (!d0030 || !d0434)) {
        src -= 4;
        LD_UH8(src, stride, p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
               q2_src, q3_src);

        cmp0 = __msa_fill_d(q_is_pcm0);
        cmp1 = __msa_fill_d(q_is_pcm4);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        tc0 = tc[0];
        beta30 = beta >> 3;
        beta20 = beta >> 2;
        tc250 = ((tc0 * 5 + 1) >> 1);

        tc4 = tc[1];
        tc254 = ((tc4 * 5 + 1) >> 1);
        cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
        cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);

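        /* Transpose the 8x8 block so that the p3..q3 samples of all eight
         * rows each land in one vector, letting the arithmetic below match
         * the horizontal-edge filter; results are transposed back before
         * storing. */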
        TRANSPOSE8x8_UB_UH(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
                           q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
                           q0_src, q1_src, q2_src, q3_src);

        flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
                abs(p3[-1] - p3[0]) < tc250;
        flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
                          abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
                          (d30 << 1) < beta20);
        cmp0 = __msa_fill_d(flag0);
        ILVR_B4_UH(zero, p3_src, zero, p2_src, zero, p1_src, zero, p0_src,
                   p3_src, p2_src, p1_src, p0_src);

        flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
                abs(p1[-1] - p1[0]) < tc254;
        flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
                          abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
                          (d34 << 1) < beta20);
        ILVR_B4_UH(zero, q0_src, zero, q1_src, zero, q2_src, zero, q3_src,
                   q0_src, q1_src, q2_src, q3_src);

        cmp1 = __msa_fill_d(flag1);
        cmp2 = __msa_ilvev_d(cmp1, cmp0);
        cmp2 = __msa_ceqi_d(cmp2, 0);

        if (flag0 && flag1) { /* strong only */
            /* strong filter */
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
            /* strong filter ends */
        } else if (flag0 == flag1) { /* weak only */
            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);
            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = ((beta + (beta >> 1)) >> 3);
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);

            dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                               (v16u8) abs_delta0);
            dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                               (v16u8) abs_delta0);
            dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                               (v16u8) abs_delta0);
            dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                               (v16u8) abs_delta0);
            /* weak filter ends */

            dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
            dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
            dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
            dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);

            PCKEV_B2_UB(dst2, dst0, dst3, dst1, dst0, dst1);

            /* transpose */
            ILVRL_B2_UB(dst1, dst0, dst4, dst5);
            ILVRL_H2_UB(dst5, dst4, dst0, dst1);

            src += 2;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
            tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);
            src += stride;

            tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
            tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
            SW(tmp2, src);
            src += stride;
            SW(tmp3, src);

            return;
        } else { /* strong + weak */
            /* strong filter */
            tc_neg = -tc_pos;

            /* p part */
            temp0 = (p1_src + p0_src + q0_src);

            temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst0 = (v16u8) (temp2 + (v8i16) p2_src);

            temp1 = temp0 + p2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - p1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst1 = (v16u8) (temp2 + (v8i16) p1_src);

            temp1 = (temp0 << 1) + p2_src + q1_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - p0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst2 = (v16u8) (temp2 + (v8i16) p0_src);

            dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
            dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
            dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);

            /* q part */
            temp0 = (q1_src + p0_src + q0_src);
            temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q2_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst5 = (v16u8) (temp2 + (v8i16) q2_src);

            temp1 = temp0 + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
            temp2 = (v8i16) (temp1 - q1_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst4 = (v16u8) (temp2 + (v8i16) q1_src);

            temp1 = (temp0 << 1) + p1_src + q2_src;
            temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
            temp2 = (v8i16) (temp1 - q0_src);
            CLIP_SH(temp2, tc_neg, tc_pos);
            dst3 = (v16u8) (temp2 + (v8i16) q0_src);

            dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
            dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
            dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
            /* strong filter ends */

            /* weak filter */
            tc_pos >>= 1;
            tc_neg = -tc_pos;

            diff0 = (v8i16) (q0_src - p0_src);
            diff1 = (v8i16) (q1_src - p1_src);
            diff0 = (diff0 << 3) + diff0;
            diff1 = (diff1 << 1) + diff1;
            delta0 = diff0 - diff1;
            delta0 = __msa_srari_h(delta0, 4);

            temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
            abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
            abs_delta0 = (v8u16) abs_delta0 < temp1;

            CLIP_SH(delta0, tc_neg, tc_pos);

            temp2 = (v8i16) (delta0 + p0_src);
            CLIP_SH_0_255(temp2);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
                                        (v16u8) p_is_pcm_vec);

            temp2 = (v8i16) (q0_src - delta0);
            CLIP_SH_0_255(temp2);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) q_is_pcm_vec);

            tmp = (beta + (beta >> 1)) >> 3;
            cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) < tmp));
            cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) < tmp));
            p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

            cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 < tmp));
            cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 < tmp));
            q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
            q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

            tc_pos >>= 1;
            tc_neg = -tc_pos;

            delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
            delta1 -= (v8i16) p1_src;
            delta1 += delta0;
            delta1 >>= 1;
            CLIP_SH(delta1, tc_neg, tc_pos);
            delta1 = (v8i16) p1_src + (v8i16) delta1;
            CLIP_SH_0_255(delta1);
            delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
                                          (v16u8) p_is_pcm_vec);

            delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
            delta2 = delta2 - (v8i16) q1_src;
            delta2 = delta2 - delta0;
            delta2 = delta2 >> 1;
            CLIP_SH(delta2, tc_neg, tc_pos);
            delta2 = (v8i16) q1_src + (v8i16) delta2;
            CLIP_SH_0_255(delta2);
            delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
                                          (v16u8) q_is_pcm_vec);
            delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
                                         (v16u8) abs_delta0);
            temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
                                        (v16u8) abs_delta0);
            temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
                                        (v16u8) abs_delta0);
            delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
                                         (v16u8) abs_delta0);
            /* weak filter ends */

            /* select between weak or strong */
            dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
            dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
            dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
            dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
            dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
            dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
        }

        dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
        dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
        dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
        dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
        dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
        dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);

        /* pack results to 8 bit */
        PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
                    dst2, dst3);

        /* transpose */
        ILVRL_B2_UB(dst1, dst0, dst4, dst5);
        ILVRL_B2_UB(dst3, dst2, dst6, dst7);
        ILVRL_H2_UB(dst5, dst4, dst0, dst1);
        ILVRL_H2_UB(dst7, dst6, dst2, dst3);

        src += 1;

        tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
        tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
        tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
        tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
        tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
        tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
        tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
        tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
        tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
        src += stride;

        tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
        tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
        tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
        tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
        SW(tmp2, src);
        SH(tmp0, src + 4);
        src += stride;
        SW(tmp3, src);
        SH(tmp1, src + 4);
    }
}

static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    uint8_t *p1_ptr = src - (stride << 1);
    uint8_t *p0_ptr = src - stride;
    uint8_t *q0_ptr = src;
    uint8_t *q1_ptr = src + stride;
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if ((tc[0] > 0) || (tc[1] > 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

        p1 = LD_UH(p1_ptr);
        p0 = LD_UH(p0_ptr);
        q0 = LD_UH(q0_ptr);
        q1 = LD_UH(q1_ptr);

        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

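        /* Chroma weak filter (H.265 8.7.2.5.8):
         * delta = clip3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3)),
         * then p0' = p0 + delta and q0' = q0 - delta. */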
        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
        ST_D2(temp0, 0, 1, p0_ptr, stride);
    }
}

static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride,
                                           int32_t *tc, uint8_t *p_is_pcm,
                                           uint8_t *q_is_pcm)
{
    v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8u16 p1, p0, q0, q1;
    v8i16 tc_pos, tc_neg;
    v16i8 zero = { 0 };
    v8i16 temp0, temp1, delta;

    if ((tc[0] > 0) || (tc[1] > 0)) {
        cmp0 = (v2i64) __msa_fill_h(tc[0]);
        cmp1 = (v2i64) __msa_fill_h(tc[1]);
        tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
        tc_neg = -tc_pos;

        cmp0 = __msa_fill_d(p_is_pcm[0]);
        cmp1 = __msa_fill_d(p_is_pcm[1]);
        p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);

        cmp0 = __msa_fill_d(q_is_pcm[0]);
        cmp1 = __msa_fill_d(q_is_pcm[1]);
        q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
        q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);

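        /* Load 8 rows of 4 pixels straddling the vertical edge and
         * transpose them so p1, p0, q0 and q1 each hold one pixel column
         * as a vector. */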
        src -= 2;
        LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
        TRANSPOSE8x4_UB_UH(src0, src1, src2, src3, src4, src5, src6, src7,
                           p1, p0, q0, q1);
        ILVR_B4_UH(zero, p1, zero, p0, zero, q0, zero, q1, p1, p0, q0, q1);

        temp0 = (v8i16) (q0 - p0);
        temp1 = (v8i16) (p1 - q1);
        temp0 <<= 2;
        temp0 += temp1;
        delta = __msa_srari_h((v8i16) temp0, 3);
        CLIP_SH(delta, tc_neg, tc_pos);

        temp0 = (v8i16) ((v8i16) p0 + delta);
        CLIP_SH_0_255(temp0);
        temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
                                    (v16u8) p_is_pcm_vec);

        temp1 = (v8i16) ((v8i16) q0 - delta);
        CLIP_SH_0_255(temp1);
        temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
                                    (v16u8) q_is_pcm_vec);

        tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
        temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
        temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);

        temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);

        src += 1;
        ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
    }
}

static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r;
    v16i8 offset, offset_val, mask;
    v16i8 dst0, offset0, offset1;
    v16i8 zero = { 0 };

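    /* Band offset: each pixel picks one of 32 bands via (pixel >> 3), and
     * the four signalled offsets cover the consecutive bands starting at
     * sao_left_class. offset0/offset1 together form a 32-entry lookup
     * table that __msa_vshf_b indexes with the band numbers. */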
    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);

    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += (4 * src_stride);

        ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

        src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
        mask = __msa_srli_b(src0_r, 3);
        offset = __msa_vshf_b(mask, offset1, offset0);

        src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
        dst0 = __msa_adds_s_b(src0_r, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        /* store results */
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }

    ILVEV_D2_SB(src0, src1, src2, src3, src0_r, src1_r);

    src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
    mask = __msa_srli_b(src0_r, 3);
    offset = __msa_vshf_b(mask, offset1, offset0);

    src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
    dst0 = __msa_adds_s_b(src0_r, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    /* store results */
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}

static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
                                            uint8_t *src, int32_t src_stride,
                                            int32_t sao_left_class,
                                            int16_t *sao_offset_val,
                                            int32_t height)
{
    v16u8 src0, src1, src2, src3;
    v16i8 src0_r, src1_r, mask0, mask1;
    v16i8 offset_mask0, offset_mask1, offset_val;
    v16i8 offset0, offset1, dst0, dst1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    /* load in advance. */
    LD_UB4(src, src_stride, src0, src1, src2, src3);

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    for (height -= 4; height; height -= 4) {
        src += src_stride << 2;

        ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

        mask0 = __msa_srli_b(src0_r, 3);
        mask1 = __msa_srli_b(src1_r, 3);

        offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
        offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

        /* load in advance. */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        XORI_B2_128_SB(src0_r, src1_r);

        dst0 = __msa_adds_s_b(src0_r, offset_mask0);
        dst1 = __msa_adds_s_b(src1_r, offset_mask1);

        XORI_B2_128_SB(dst0, dst1);

        /* store results */
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += dst_stride << 2;
    }

    ILVR_D2_SB(src1, src0, src3, src2, src0_r, src1_r);

    mask0 = __msa_srli_b(src0_r, 3);
    mask1 = __msa_srli_b(src1_r, 3);

    offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
    offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);

    XORI_B2_128_SB(src0_r, src1_r);

    dst0 = __msa_adds_s_b(src0_r, offset_mask0);
    dst1 = __msa_adds_s_b(src1_r, offset_mask1);

    XORI_B2_128_SB(dst0, dst1);

    /* store results */
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}

static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
                                                int32_t dst_stride,
                                                uint8_t *src,
                                                int32_t src_stride,
                                                int32_t sao_left_class,
                                                int16_t *sao_offset_val,
                                                int32_t width, int32_t height)
{
    int32_t w_cnt;
    v16u8 src0, src1, src2, src3;
    v16i8 out0, out1, out2, out3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
    v16i8 offset0, offset1;
    v16i8 zero = { 0 };

    offset_val = LD_SB(sao_offset_val + 1);
    offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
    offset_val = __msa_pckev_b(offset_val, offset_val);
    offset1 = (v16i8) __msa_insve_w((v4i32) zero, 3, (v4i32) offset_val);
    offset0 = __msa_sld_b(offset1, zero, 28 - ((sao_left_class) & 31));
    offset1 = __msa_sld_b(zero, offset1, 28 - ((sao_left_class) & 31));

    if (!((sao_left_class > 12) & (sao_left_class < 29))) {
        SWAP(offset0, offset1);
    }

    while (height > 0) {
        /* load in advance */
        LD_UB4(src, src_stride, src0, src1, src2, src3);

        for (w_cnt = 16; w_cnt < width; w_cnt += 16) {
            mask0 = __msa_srli_b((v16i8) src0, 3);
            mask1 = __msa_srli_b((v16i8) src1, 3);
            mask2 = __msa_srli_b((v16i8) src2, 3);
            mask3 = __msa_srli_b((v16i8) src3, 3);

            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
                       tmp0, tmp1);
            VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
                       tmp2, tmp3);
            XORI_B4_128_UB(src0, src1, src2, src3);

            out0 = __msa_adds_s_b((v16i8) src0, tmp0);
            out1 = __msa_adds_s_b((v16i8) src1, tmp1);
            out2 = __msa_adds_s_b((v16i8) src2, tmp2);
            out3 = __msa_adds_s_b((v16i8) src3, tmp3);

            /* load for next iteration */
            LD_UB4(src + w_cnt, src_stride, src0, src1, src2, src3);

            XORI_B4_128_SB(out0, out1, out2, out3);

            ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
        }

        mask0 = __msa_srli_b((v16i8) src0, 3);
        mask1 = __msa_srli_b((v16i8) src1, 3);
        mask2 = __msa_srli_b((v16i8) src2, 3);
        mask3 = __msa_srli_b((v16i8) src3, 3);

        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
                   tmp1);
        VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
                   tmp3);
        XORI_B4_128_UB(src0, src1, src2, src3);

        out0 = __msa_adds_s_b((v16i8) src0, tmp0);
        out1 = __msa_adds_s_b((v16i8) src1, tmp1);
        out2 = __msa_adds_s_b((v16i8) src2, tmp2);
        out3 = __msa_adds_s_b((v16i8) src3, tmp3);

        XORI_B4_128_SB(out0, out1, out2, out3);

        ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);

        src += src_stride << 2;
        dst += dst_stride << 2;
        height -= 4;
    }
}

static void hevc_sao_edge_filter_0degree_4width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    uint32_t dst_val0, dst_val1;
    v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16i8 sao_offset = LD_SB(sao_offset_val);
    v16i8 src_plus10, offset, src0, dst0;
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 zero = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src -= 1;
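    /* 0-degree edge offset: each pixel is compared with its left and right
     * neighbours; sign(cur - left) + sign(cur - right) + 2 indexes
     * edge_idx { 1, 2, 0, 3, 4 }, which in turn selects the SAO offset. */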

    /* load in advance */
    LD_UB2(src, src_stride, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += (2 * src_stride);

        src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
                                            (v2i64) src_minus10);

        src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
        src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);

        cmp_minus10 = ((v16u8) src0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

        /* load in advance */
        LD_UB2(src, src_stride, src_minus10, src_minus11);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
        dst0 = __msa_adds_s_b(src0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);
        dst += dst_stride;
    }

    src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
                                        (v2i64) src_minus10);

    src0 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 1);
    src_plus10 = (v16i8) __msa_sldi_b(zero, (v16i8) src_minus10, 2);

    cmp_minus10 = ((v16u8) src0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus10 = ((v16u8) src0 == (v16u8) src_plus10);
    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = ((v16u8) src_plus10 < (v16u8) src0);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    src0 = (v16i8) __msa_xori_b((v16u8) src0, 128);
    dst0 = __msa_adds_s_b(src0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
}

static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst,
                                                    int32_t dst_stride,
                                                    uint8_t *src,
                                                    int32_t src_stride,
                                                    int16_t *sao_offset_val,
                                                    int32_t height)
{
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, diff_minus10, diff_minus11;
    v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src -= 1;

    /* load in advance */
    LD_UB2(src, src_stride, src_minus10, src_minus11);

    for (height -= 2; height; height -= 2) {
        src += (src_stride << 1);

        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10,
                   src_plus11);

        PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
                    src_minus10, src_plus10);
        src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);

        cmp_minus10 = (src0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < src0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus10 = (src0 == src_plus10);
        diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_plus10 < src0);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

        offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

        /* load in advance */
        LD_UB2(src, src_stride, src_minus10, src_minus11);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        src0 = __msa_xori_b(src0, 128);
        dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
        dst0 = __msa_xori_b(dst0, 128);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10,
               src_plus11);

    PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
                src_plus10);
    src0 = (v16u8) __msa_pckev_d((v2i64) src1, (v2i64) src0);

    cmp_minus10 = ((v16u8) src0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus10 = (src0 == src_plus10);
    diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_plus10 < src0);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);

    offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    src0 = __msa_xori_b(src0, 128);
    dst0 = (v16u8) __msa_adds_s_b((v16i8) src0, offset);
    dst0 = __msa_xori_b(dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst,
                                                        int32_t dst_stride,
                                                        uint8_t *src,
                                                        int32_t src_stride,
                                                        int16_t *sao_offset_val,
                                                        int32_t width,
                                                        int32_t height)
{
    uint8_t *dst_ptr, *src_minus1;
    int32_t v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 sao_offset;
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
    v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 src_plus10, src_plus11, src_plus12, src_plus13;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_minus1 = src - 1;
        LD_UB4(src_minus1, src_stride,
               src_minus10, src_minus11, src_minus12, src_minus13);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus1 += 16;
            dst_ptr = dst + v_cnt;
            LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);

            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
                       src12, src_minus12, src13, src_minus13, 1,
                       src_zero0, src_zero1, src_zero2, src_zero3);
            SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
                       src12, src_minus12, src13, src_minus13, 2,
                       src_plus10, src_plus11, src_plus12, src_plus13);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            src_minus10 = src10;
            ST_UB(dst0, dst_ptr);
            src_minus11 = src11;
            ST_UB(dst1, dst_ptr + dst_stride);
            src_minus12 = src12;
            ST_UB(dst2, dst_ptr + (dst_stride << 1));
            src_minus13 = src13;
            ST_UB(dst3, dst_ptr + (dst_stride * 3));
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

static void hevc_sao_edge_filter_90degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint32_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 dst0;
    v16i8 sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v16i8 src_zero0, src_zero1;
    v16i8 offset;
    v8i16 offset_mask0, offset_mask1;

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

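    /* 90-degree edge offset: the neighbours are the rows above and below.
     * Interleaving the centre row with itself and its two neighbours with
     * each other lines both comparisons up in adjacent byte lanes, so
     * __msa_hadd_u_h can sum the two sign terms per pixel. */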
1577 /* load in advance */
1578 LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
1579 LD_UB2(src + src_stride, src_stride, src10, src11);
1580
1581 for (height -= 2; height; height -= 2) {
1582 src += (src_stride << 1);
1583
1584 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1585 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1586 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1587 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1588
1589 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1590 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1591 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1592 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1593
1594 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1595 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1596 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1597 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1598
1599 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1600 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1601
1602 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1603 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1604
1605 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
1606 offset, offset);
1607
1608 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1609 dst0 = __msa_adds_s_b(dst0, offset);
1610 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1611
1612 src_minus10 = src10;
1613 src_minus11 = src11;
1614
1615 /* load in advance */
1616 LD_UB2(src + src_stride, src_stride, src10, src11);
1617
1618 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1619 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1620 SW(dst_val0, dst);
1621 dst += dst_stride;
1622 SW(dst_val1, dst);
1623
1624 dst += dst_stride;
1625 }
1626
1627 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1628 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1629 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1630 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1631
1632 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1633 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1634 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1635 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1636
1637 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1638 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1639 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1640 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1641
1642 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1643 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1644
1645 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1646 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1647
1648 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
1649 offset, offset, offset);
1650
1651 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1652 dst0 = __msa_adds_s_b(dst0, offset);
1653 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1654
1655 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1656 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1657 SW(dst_val0, dst);
1658 dst += dst_stride;
1659 SW(dst_val1, dst);
1660 }
1661
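/* SAO edge offset, 90 degree direction, 8 pixels wide: the
 * classification is identical to the 4 pixel wide case above, but each
 * output row is stored as a full 8 byte doubleword (SD) instead of a
 * 4 byte word (SW).
 */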
static void hevc_sao_edge_filter_90degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 src_zero0, src_zero1, dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src_minus11, src10, src11;
    v8i16 offset_mask0, offset_mask1;

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    /* load in advance */
    LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src += (src_stride << 1);

        src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
        src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
        src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
        src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
                   offset, offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
    src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
    src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
    src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

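/* SAO edge offset, 90 degree direction, width a multiple of 16: the
 * outer loop walks the width in 16 pixel columns and the inner loop
 * processes four full rows per pass, keeping separate sign vectors for
 * the above and below neighbours instead of the interleave trick.
 */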
static void hevc_sao_edge_filter_90degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t h_cnt, v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, dst2, src13, dst3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
        src = src_orig + v_cnt;
        dst = dst_orig + v_cnt;

        LD_UB2(src - src_stride, src_stride, src_minus10, src_minus11);

        for (h_cnt = (height >> 2); h_cnt--;) {
            LD_UB4(src + src_stride, src_stride, src10, src11, src12, src13);

            cmp_minus10 = (src_minus11 == src_minus10);
            cmp_plus10 = (src_minus11 == src10);
            cmp_minus11 = (src10 == src_minus11);
            cmp_plus11 = (src10 == src11);
            cmp_minus12 = (src11 == src10);
            cmp_plus12 = (src11 == src12);
            cmp_minus13 = (src12 == src11);
            cmp_plus13 = (src12 == src13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < src_minus11);
            cmp_plus10 = (src10 < src_minus11);
            cmp_minus11 = (src_minus11 < src10);
            cmp_plus11 = (src11 < src10);
            cmp_minus12 = (src10 < src11);
            cmp_plus12 = (src12 < src11);
            cmp_minus13 = (src11 < src12);
            cmp_plus13 = (src13 < src12);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            src_minus10 = src12;
            XORI_B4_128_UB(src_minus11, src10, src11, src12);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);
            src_minus11 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);

            src += (src_stride << 2);
            dst += (dst_stride << 2);
        }
    }
}

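/* SAO edge offset, 45 degree direction, 4 pixels wide.  The two
 * neighbours are the upper-left and lower-right pixels: rows are
 * loaded from src - 1 and re-aligned with SLDI, so byte offset 0 of a
 * row above is the upper-left neighbour, offset 1 the centre and
 * offset 2 of a row below the lower-right neighbour.  Classification
 * and offset lookup work exactly as in the 90 degree case.
 */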
static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    uint32_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus11, src10, src11;
    v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);

        ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset,
                   offset, offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
    }

    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);

    ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
}

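/* SAO edge offset, 45 degree direction, 8 pixels wide: same scheme as
 * the 4 pixel wide 45 degree case, with 8 bytes stored per row.
 */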
static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst,
                                                     int32_t dst_stride,
                                                     uint8_t *src,
                                                     int32_t src_stride,
                                                     int16_t *sao_offset_val,
                                                     int32_t height)
{
    uint8_t *src_orig;
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);

        ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
                   src_minus10, src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
                   src_zero0, src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
    ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    src_minus10 = src10;
    src_minus11 = src11;

    /* load in advance */
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
}

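/* SAO edge offset, 45 degree direction, width a multiple of 16.  The
 * centre and diagonal neighbour vectors are assembled with SLDI from
 * pairs of adjacent 16 byte loads, so the column loop needs no
 * unaligned reloads.
 */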
static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst,
                                                         int32_t dst_stride,
                                                         uint8_t *src,
                                                         int32_t src_stride,
                                                         int16_t *
                                                         sao_offset_val,
                                                         int32_t width,
                                                         int32_t height)
{
    uint8_t *src_orig = src;
    uint8_t *dst_orig = dst;
    int32_t v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
    v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
    v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
    v16u8 diff_plus13, src_minus14, src_plus13;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
    v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
    v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
    v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
    v16i8 src_zero3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;
        LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
               src_minus14);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = LD_UB(src_orig - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
            src_orig += 16;

            SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
                       src12, src_minus13, src13, src_minus14, 1,
                       src_zero0, src_zero1, src_zero2, src_zero3);
            SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
                       src_plus11);

            src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);

            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
            cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
            cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_minus12 = src11;
            src_minus13 = src12;
            src_minus14 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

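/* SAO edge offset, 135 degree direction, 4 pixels wide.  Here the two
 * neighbours are the upper-right and lower-left pixels: the rows above
 * are shifted left by 2 bytes (upper-right) while the rows below are
 * taken unshifted from src - 1 (lower-left).
 */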
static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    uint32_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16i8 src_zero0, src_zero1, dst0;
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
                   src_minus11);

        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
        dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

        SW(dst_val0, dst);
        dst += dst_stride;
        SW(dst_val1, dst);

        dst += dst_stride;
    }

    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
               src_minus11);

    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
    dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);

    SW(dst_val0, dst);
    dst += dst_stride;
    SW(dst_val1, dst);
    dst += dst_stride;
}

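/* SAO edge offset, 135 degree direction, 8 pixels wide: same scheme as
 * the 4 pixel wide 135 degree case, with 8 bytes stored per row.
 */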
static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst,
                                                      int32_t dst_stride,
                                                      uint8_t *src,
                                                      int32_t src_stride,
                                                      int16_t *sao_offset_val,
                                                      int32_t height)
{
    uint8_t *src_orig;
    uint64_t dst_val0, dst_val1;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16i8 offset, sao_offset = LD_SB(sao_offset_val);
    v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
    v16u8 src_minus10, src10, src_minus11, src11;
    v16i8 src_zero0, src_zero1, dst0;
    v8i16 offset_mask0, offset_mask1;
    v16i8 zeros = { 0 };

    sao_offset = __msa_pckev_b(sao_offset, sao_offset);
    src_orig = src - 1;

    /* load in advance */
    LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
    LD_UB2(src_orig + src_stride, src_stride, src10, src11);

    for (height -= 2; height; height -= 2) {
        src_orig += (src_stride << 1);

        SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
        SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
                   src_minus11);
        ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
                   src_minus11);
        ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
                   src_zero1);

        cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
        diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
        cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
        diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

        cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
        diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
        cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
        diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

        offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
        offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

        offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
        dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

        VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
                   offset, offset);

        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
        dst0 = __msa_adds_s_b(dst0, offset);
        dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

        src_minus10 = src10;
        src_minus11 = src11;

        /* load in advance */
        LD_UB2(src_orig + src_stride, src_stride, src10, src11);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);

        SD(dst_val0, dst);
        dst += dst_stride;
        SD(dst_val1, dst);
        dst += dst_stride;
    }

    SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
    SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10,
               src_minus11);
    ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
               src_minus11);
    ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
               src_zero1);

    cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
    diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
    cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
    diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);

    cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
    diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
    cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
    diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);

    offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
    offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);

    offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
    dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);

    VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset, offset,
               offset, offset);

    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
    dst0 = __msa_adds_s_b(dst0, offset);
    dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);

    dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);

    SD(dst_val0, dst);
    dst += dst_stride;
    SD(dst_val1, dst);
    dst += dst_stride;
}

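/* SAO edge offset, 135 degree direction, width a multiple of 16, four
 * rows per pass.  Neighbour vectors are again built with SLDI from
 * adjacent loads; the row above the block is re-read at src_orig + 2
 * so its bytes line up with the upper-right neighbours of row 0.
 */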
static void hevc_sao_edge_filter_135degree_16multiple_msa(uint8_t *dst,
                                                          int32_t dst_stride,
                                                          uint8_t *src,
                                                          int32_t src_stride,
                                                          int16_t *
                                                          sao_offset_val,
                                                          int32_t width,
                                                          int32_t height)
{
    uint8_t *src_orig, *dst_orig;
    int32_t v_cnt;
    v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    v16u8 const1 = (v16u8) __msa_ldi_b(1);
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
    v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
    v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
    v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
    v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
    v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
    v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;

    sao_offset = LD_SB(sao_offset_val);
    sao_offset = __msa_pckev_b(sao_offset, sao_offset);

    for (; height; height -= 4) {
        src_orig = src - 1;
        dst_orig = dst;

        LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
               src_plus12);

        for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
            src_minus10 = LD_UB(src_orig + 2 - src_stride);
            LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
            src_plus13 = LD_UB(src_orig + (src_stride << 2));
            src_orig += 16;

            src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
            cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
            cmp_plus10 = ((v16u8) src_zero0 == src_plus10);

            src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
            src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
                                               (v16i8) src_minus11, 2);
            cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
            cmp_plus11 = ((v16u8) src_zero1 == src_plus11);

            src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
            src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
            cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
            cmp_plus12 = ((v16u8) src_zero2 == src_plus12);

            src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
            src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
            cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
            cmp_plus13 = ((v16u8) src_zero3 == src_plus13);

            diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
            diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
            diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
            diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
            diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
            diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
            diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
            diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);

            cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
            cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
            cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
            cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
            cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
            cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
            cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
            cmp_plus13 = (src_plus13 < (v16u8) src_zero3);

            diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
            diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
            diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
            diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
            diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
            diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
            diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
            diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);

            offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
            offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
            offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
            offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;

            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask0, offset_mask0, offset_mask0, offset_mask0);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask1, offset_mask1, offset_mask1, offset_mask1);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask2, offset_mask2, offset_mask2, offset_mask2);
            VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
                       offset_mask3, offset_mask3, offset_mask3, offset_mask3);

            XORI_B4_128_SB(src_zero0, src_zero1, src_zero2, src_zero3);

            dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
            dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
            dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
            dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);

            XORI_B4_128_UB(dst0, dst1, dst2, dst3);

            src_minus11 = src10;
            src_plus10 = src11;
            src_plus11 = src12;
            src_plus12 = src13;

            ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
            dst_orig += 16;
        }

        src += (src_stride << 2);
        dst += (dst_stride << 2);
    }
}

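/* Exported entry points.  These are the functions declared in
 * hevcdsp_mips.h and wired into the HEVC DSP context on MSA capable
 * MIPS CPUs; each one simply forwards to the matching static
 * implementation above.
 */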
void ff_hevc_loop_filter_luma_h_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_hor_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_luma_v_8_msa(uint8_t *src,
                                      ptrdiff_t src_stride,
                                      int32_t beta, int32_t *tc,
                                      uint8_t *no_p, uint8_t *no_q)
{
    hevc_loopfilter_luma_ver_msa(src, src_stride, beta, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_h_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_hor_msa(src, src_stride, tc, no_p, no_q);
}

void ff_hevc_loop_filter_chroma_v_8_msa(uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int32_t *tc, uint8_t *no_p,
                                        uint8_t *no_q)
{
    hevc_loopfilter_chroma_ver_msa(src, src_stride, tc, no_p, no_q);
}

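/* SAO band filter dispatcher: the width is split into a multiple of
 * 16 part, an 8 pixel part and a 4 pixel remainder, each handled by
 * its specialised kernel.
 */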
void ff_hevc_sao_band_filter_0_8_msa(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride_dst, ptrdiff_t stride_src,
                                     int16_t *sao_offset_val, int sao_left_class,
                                     int width, int height)
{
    if (width >> 4) {
        hevc_sao_band_filter_16multiple_msa(dst, stride_dst, src, stride_src,
                                            sao_left_class, sao_offset_val,
                                            width - (width % 16), height);
        dst += width - (width % 16);
        src += width - (width % 16);
        width %= 16;
    }

    if (width >> 3) {
        hevc_sao_band_filter_8width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
        dst += 8;
        src += 8;
        width %= 8;
    }

    if (width) {
        hevc_sao_band_filter_4width_msa(dst, stride_dst, src, stride_src,
                                        sao_left_class, sao_offset_val, height);
    }
}

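/* SAO edge filter dispatcher.  The eo argument selects the direction:
 * 0 horizontal, 1 vertical, 2 the upper-left/lower-right diagonal and
 * 3 the upper-right/lower-left diagonal.  stride_src is the fixed
 * stride of the caller's temporary SAO pixel buffer.  As in the band
 * filter, the width is split into 16, 8 and 4 pixel parts.
 */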
void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src,
                                   ptrdiff_t stride_dst,
                                   int16_t *sao_offset_val,
                                   int eo, int width, int height)
{
    ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t);

    switch (eo) {
    case 0:
        if (width >> 4) {
            hevc_sao_edge_filter_0degree_16multiple_msa(dst, stride_dst,
                                                        src, stride_src,
                                                        sao_offset_val,
                                                        width - (width % 16),
                                                        height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_0degree_8width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_0degree_4width_msa(dst, stride_dst,
                                                    src, stride_src,
                                                    sao_offset_val, height);
        }
        break;

    case 1:
        if (width >> 4) {
            hevc_sao_edge_filter_90degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_90degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_90degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 2:
        if (width >> 4) {
            hevc_sao_edge_filter_45degree_16multiple_msa(dst, stride_dst,
                                                         src, stride_src,
                                                         sao_offset_val,
                                                         width - (width % 16),
                                                         height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_45degree_8width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_45degree_4width_msa(dst, stride_dst,
                                                     src, stride_src,
                                                     sao_offset_val, height);
        }
        break;

    case 3:
        if (width >> 4) {
            hevc_sao_edge_filter_135degree_16multiple_msa(dst, stride_dst,
                                                          src, stride_src,
                                                          sao_offset_val,
                                                          width - (width % 16),
                                                          height);
            dst += width - (width % 16);
            src += width - (width % 16);
            width %= 16;
        }

        if (width >> 3) {
            hevc_sao_edge_filter_135degree_8width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
            dst += 8;
            src += 8;
            width %= 8;
        }

        if (width) {
            hevc_sao_edge_filter_135degree_4width_msa(dst, stride_dst,
                                                      src, stride_src,
                                                      sao_offset_val, height);
        }
        break;
    }
}
