1 /*
2  * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/hevcdec.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "hevcpred_mips.h"
24 
/* Angle step tables for HEVC angular intra prediction; the values match
 * the intraPredAngle table of the H.265 specification.
 * NOTE(review): presumably indexed by angular mode offset ("up" for the
 * vertical mode set, "low" for the horizontal set) -- the callers are
 * outside this chunk, confirm against them. */
static const int8_t intra_pred_angle_up[17] = {
    -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
};

static const int8_t intra_pred_angle_low[16] = {
    32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
};
32 
/* Produce two 16-pixel rows of planar prediction, byte-packed into
 * res0/res1.
 *
 * Per lane, each of the four 8-halfword accumulators is the weighted sum
 *     mul_val_h{0,2} * vec{0,1}   (horizontal weight * left pixel)
 *   + mul_val_h{1,3} * tmp0       (right-column pixel)
 *   + mul_val_b0     * src0_{r,l} (vertical weight * top row)
 *   + mul_val_b1     * tmp1       (bottom-left pixel),
 * then rounded-shifted right by 'round'.  The second output row reuses
 * the same inputs with the vertical weights stepped by one
 * ((mul_val_b0 - 1) and (mul_val_b1 + 1)). */
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,          \
                              mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
                              res0, res1, mul_val_b0, mul_val_b1, round)       \
{                                                                              \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
                                                                               \
    MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
         mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
                                                                               \
    res0_m += mul_val_h1 * tmp0;                                               \
    res1_m += mul_val_h3 * tmp0;                                               \
    res2_m += mul_val_h1 * tmp0;                                               \
    res3_m += mul_val_h3 * tmp0;                                               \
                                                                               \
    res0_m += mul_val_b0 * src0_r;                                             \
    res1_m += mul_val_b0 * src0_l;                                             \
    res2_m += (mul_val_b0 - 1) * src0_r;                                       \
    res3_m += (mul_val_b0 - 1) * src0_l;                                       \
                                                                               \
    res0_m += mul_val_b1 * tmp1;                                               \
    res1_m += mul_val_b1 * tmp1;                                               \
    res2_m += (mul_val_b1 + 1) * tmp1;                                         \
    res3_m += (mul_val_b1 + 1) * tmp1;                                         \
                                                                               \
    SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
    PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
}
60 
hevc_intra_pred_vert_4x4_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)61 static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top,
62                                          const uint8_t *src_left,
63                                          uint8_t *dst, int32_t stride,
64                                          int32_t flag)
65 {
66     uint32_t col;
67     uint32_t src_data;
68     v8i16 vec0, vec1, vec2;
69     v16i8 zero = { 0 };
70 
71     src_data = LW(src_top);
72     SW4(src_data, src_data, src_data, src_data, dst, stride);
73 
74     if (0 == flag) {
75         src_data = LW(src_left);
76 
77         vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
78 
79         vec0 = __msa_fill_h(src_left[-1]);
80         vec1 = __msa_fill_h(src_top[0]);
81 
82         vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
83         vec2 -= vec0;
84         vec2 >>= 1;
85         vec2 += vec1;
86         CLIP_SH_0_255(vec2);
87 
88         for (col = 0; col < 4; col++) {
89             dst[stride * col] = (uint8_t) vec2[col];
90         }
91     }
92 }
93 
hevc_intra_pred_vert_8x8_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)94 static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top,
95                                          const uint8_t *src_left,
96                                          uint8_t *dst, int32_t stride,
97                                          int32_t flag)
98 {
99     uint8_t *tmp_dst = dst;
100     uint32_t row;
101     uint16_t val0, val1, val2, val3;
102     uint64_t src_data1;
103     v8i16 vec0, vec1, vec2;
104     v16i8 zero = { 0 };
105 
106     src_data1 = LD(src_top);
107 
108     for (row = 8; row--;) {
109         SD(src_data1, tmp_dst);
110         tmp_dst += stride;
111     }
112 
113     if (0 == flag) {
114         src_data1 = LD(src_left);
115 
116         vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
117 
118         vec0 = __msa_fill_h(src_left[-1]);
119         vec1 = __msa_fill_h(src_top[0]);
120 
121         vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
122         vec2 -= vec0;
123         vec2 >>= 1;
124         vec2 += vec1;
125         CLIP_SH_0_255(vec2);
126 
127         val0 = vec2[0];
128         val1 = vec2[1];
129         val2 = vec2[2];
130         val3 = vec2[3];
131 
132         dst[0] = val0;
133         dst[stride] = val1;
134         dst[2 * stride] = val2;
135         dst[3 * stride] = val3;
136 
137         val0 = vec2[4];
138         val1 = vec2[5];
139         val2 = vec2[6];
140         val3 = vec2[7];
141 
142         dst[4 * stride] = val0;
143         dst[5 * stride] = val1;
144         dst[6 * stride] = val2;
145         dst[7 * stride] = val3;
146     }
147 }
148 
/* 16x16 vertical intra prediction: copy the 16 top neighbour pixels into
 * every row.  When flag == 0 the first column is filtered:
 *     dst[y][0] = clip_255(top[0] + ((left[y] - left[-1]) >> 1)). */
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top,
                                           const uint8_t *src_left,
                                           uint8_t *dst, int32_t stride,
                                           int32_t flag)
{
    int32_t col;
    uint8_t *tmp_dst = dst;
    uint32_t row;
    v16u8 src;
    v8i16 vec0, vec1, vec2, vec3;

    /* replicate the 16-byte top row into all 16 rows */
    src = LD_UB(src_top);

    for (row = 16; row--;) {
        ST_UB(src, tmp_dst);
        tmp_dst += stride;
    }

    if (0 == flag) {
        /* left-edge filtering on the first column */
        src = LD_UB(src_left);

        vec0 = __msa_fill_h(src_left[-1]);
        vec1 = __msa_fill_h(src_top[0]);

        /* widen the 16 left pixels to two 8-halfword vectors */
        UNPCK_UB_SH(src, vec2, vec3);
        SUB2(vec2, vec0, vec3, vec0, vec2, vec3);

        vec2 >>= 1;
        vec3 >>= 1;

        ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
        CLIP_SH2_0_255(vec2, vec3);

        src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);

        for (col = 0; col < 16; col++) {
            dst[stride * col] = src[col];
        }
    }
}
189 
hevc_intra_pred_horiz_4x4_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)190 static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top,
191                                           const uint8_t *src_left,
192                                           uint8_t *dst, int32_t stride,
193                                           int32_t flag)
194 {
195     uint32_t val0, val1, val2, val3;
196     v16i8 src0;
197     v8i16 src0_r, src_top_val, src_left_val;
198     v16i8 zero = { 0 };
199 
200     val0 = src_left[0] * 0x01010101;
201     val1 = src_left[1] * 0x01010101;
202     val2 = src_left[2] * 0x01010101;
203     val3 = src_left[3] * 0x01010101;
204     SW4(val0, val1, val2, val3, dst, stride);
205 
206     if (0 == flag) {
207         val0 = LW(src_top);
208         src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209         src_top_val = __msa_fill_h(src_top[-1]);
210         src_left_val = __msa_fill_h(src_left[0]);
211 
212         src0_r = (v8i16) __msa_ilvr_b(zero, src0);
213 
214         src0_r -= src_top_val;
215         src0_r >>= 1;
216         src0_r += src_left_val;
217         CLIP_SH_0_255(src0_r);
218         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219         val0 = __msa_copy_s_w((v4i32) src0, 0);
220         SW(val0, dst);
221     }
222 }
223 
hevc_intra_pred_horiz_8x8_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)224 static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top,
225                                           const uint8_t *src_left,
226                                           uint8_t *dst, int32_t stride,
227                                           int32_t flag)
228 {
229     uint64_t val0, val1, val2, val3;
230     v16i8 src0;
231     v8i16 src0_r, src_top_val, src_left_val;
232     v16i8 zero = { 0 };
233 
234     val0 = src_left[0] * 0x0101010101010101;
235     val1 = src_left[1] * 0x0101010101010101;
236     val2 = src_left[2] * 0x0101010101010101;
237     val3 = src_left[3] * 0x0101010101010101;
238     SD4(val0, val1, val2, val3, dst, stride);
239 
240     val0 = src_left[4] * 0x0101010101010101;
241     val1 = src_left[5] * 0x0101010101010101;
242     val2 = src_left[6] * 0x0101010101010101;
243     val3 = src_left[7] * 0x0101010101010101;
244     SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
245 
246     if (0 == flag) {
247         val0 = LD(src_top);
248         src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249         src_top_val = __msa_fill_h(src_top[-1]);
250         src_left_val = __msa_fill_h(src_left[0]);
251 
252         src0_r = (v8i16) __msa_ilvr_b(zero, src0);
253 
254         src0_r -= src_top_val;
255         src0_r >>= 1;
256         src0_r += src_left_val;
257         CLIP_SH_0_255(src0_r);
258         src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259         val0 = __msa_copy_s_d((v2i64) src0, 0);
260         SD(val0, dst);
261     }
262 }
263 
hevc_intra_pred_horiz_16x16_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)264 static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top,
265                                             const uint8_t *src_left,
266                                             uint8_t *dst, int32_t stride,
267                                             int32_t flag)
268 {
269     uint8_t *tmp_dst = dst;
270     uint32_t row;
271     uint8_t inp0, inp1, inp2, inp3;
272     v16i8 src0, src1, src2, src3;
273     v8i16 src0_r, src0_l, src_left_val, src_top_val;
274 
275     src_left_val = __msa_fill_h(src_left[0]);
276 
277     for (row = 4; row--;) {
278         inp0 = src_left[0];
279         inp1 = src_left[1];
280         inp2 = src_left[2];
281         inp3 = src_left[3];
282         src_left += 4;
283 
284         src0 = __msa_fill_b(inp0);
285         src1 = __msa_fill_b(inp1);
286         src2 = __msa_fill_b(inp2);
287         src3 = __msa_fill_b(inp3);
288 
289         ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
290         tmp_dst += (4 * stride);
291     }
292 
293     if (0 == flag) {
294         src0 = LD_SB(src_top);
295         src_top_val = __msa_fill_h(src_top[-1]);
296 
297         UNPCK_UB_SH(src0, src0_r, src0_l);
298         SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
299 
300         src0_r >>= 1;
301         src0_l >>= 1;
302 
303         ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
304         CLIP_SH2_0_255(src0_r, src0_l);
305         src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
306         ST_SB(src0, dst);
307     }
308 }
309 
hevc_intra_pred_horiz_32x32_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride)310 static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top,
311                                             const uint8_t *src_left,
312                                             uint8_t *dst, int32_t stride)
313 {
314     uint32_t row;
315     uint8_t inp0, inp1, inp2, inp3;
316     v16i8 src0, src1, src2, src3;
317 
318     for (row = 0; row < 8; row++) {
319         inp0 = src_left[row * 4];
320         inp1 = src_left[row * 4 + 1];
321         inp2 = src_left[row * 4 + 2];
322         inp3 = src_left[row * 4 + 3];
323 
324         src0 = __msa_fill_b(inp0);
325         src1 = __msa_fill_b(inp1);
326         src2 = __msa_fill_b(inp2);
327         src3 = __msa_fill_b(inp3);
328 
329         ST_SB2(src0, src0, dst, 16);
330         dst += stride;
331         ST_SB2(src1, src1, dst, 16);
332         dst += stride;
333         ST_SB2(src2, src2, dst, 16);
334         dst += stride;
335         ST_SB2(src3, src3, dst, 16);
336         dst += stride;
337     }
338 }
339 
hevc_intra_pred_dc_4x4_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t flag)340 static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top,
341                                        const uint8_t *src_left,
342                                        uint8_t *dst, int32_t stride,
343                                        int32_t flag)
344 {
345     uint8_t *tmp_dst = dst;
346     uint32_t addition = 0;
347     uint32_t val0, val1, val2;
348     v16i8 src = { 0 };
349     v16u8 store;
350     v16i8 zero = { 0 };
351     v8u16 sum, vec0, vec1;
352 
353     val0 = LW(src_top);
354     val1 = LW(src_left);
355     INSERT_W2_SB(val0, val1, src);
356     sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357     sum = (v8u16) __msa_hadd_u_w(sum, sum);
358     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359     sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360     addition = __msa_copy_u_w((v4i32) sum, 0);
361     store = (v16u8) __msa_fill_b(addition);
362     val0 = __msa_copy_u_w((v4i32) store, 0);
363     SW4(val0, val0, val0, val0, dst, stride)
364 
365         if (0 == flag) {
366         ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
367 
368         vec1 += vec0;
369         vec0 += vec0;
370         vec1 += vec0;
371 
372         vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373         store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374         val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375         store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376         val0 = __msa_copy_u_w((v4i32) store, 0);
377         SW(val0, tmp_dst);
378 
379         val0 = src_left[1];
380         val1 = src_left[2];
381         val2 = src_left[3];
382 
383         addition *= 3;
384 
385         ADD2(val0, addition, val1, addition, val0, val1);
386         val2 += addition;
387 
388         val0 += 2;
389         val1 += 2;
390         val2 += 2;
391         val0 >>= 2;
392         val1 >>= 2;
393         val2 >>= 2;
394 
395         tmp_dst[stride * 1] = val0;
396         tmp_dst[stride * 2] = val1;
397         tmp_dst[stride * 3] = val2;
398     }
399 }
400 
/* 8x8 DC intra prediction: dc = round((sum of 8 top + 8 left) / 16),
 * replicated over the block.  When flag == 0 the edges are filtered:
 *   corner      dst[0][0] = (left[0] + 2*dc + top[0] + 2) >> 2
 *   first row   dst[0][x] = (top[x]  + 3*dc + 2) >> 2
 *   first col   dst[y][0] = (left[y] + 3*dc + 2) >> 2 */
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top,
                                       const uint8_t *src_left,
                                       uint8_t *dst, int32_t stride,
                                       int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    uint64_t val0, val1;
    v16u8 src = { 0 };
    v16u8 store;
    v8u16 sum, vec0, vec1;
    v16i8 zero = { 0 };

    /* horizontal-add the 16 neighbour pixels down to one scalar sum,
     * then rounded-shift by 4 to get the DC value */
    val0 = LD(src_top);
    val1 = LD(src_left);
    INSERT_D2_UB(val0, val1, src);
    sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);
    val0 = __msa_copy_u_d((v2i64) store, 0);

    /* fill all eight rows with the DC value */
    for (row = 8; row--;) {
        SD(val0, dst);
        dst += stride;
    }

    if (0 == flag) {
        /* first row: (top[x] + 3*dc + 2) >> 2, corner pixel patched in */
        ILVR_B2_UH(zero, store, zero, src, vec0, vec1);

        vec1 += vec0;      /* top + dc   */
        vec0 += vec0;      /* 2*dc       */
        vec1 += vec0;      /* top + 3*dc */
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        val0 = __msa_copy_u_d((v2i64) store, 0);
        SD(val0, tmp_dst);

        /* first column, rows 1..7: (left[y] + 3*dc + 2) >> 2 */
        val0 = LD(src_left);
        src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
        vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        vec1 += vec0;
        vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);

        for (col = 1; col < 8; col++) {
            tmp_dst[stride * col] = vec1[col];
        }
    }
}
459 
/* 16x16 DC intra prediction: dc = round((sum of 16 top + 16 left) / 32),
 * replicated over the block.  When flag == 0 the edges are filtered:
 *   corner      dst[0][0] = (left[0] + 2*dc + top[0] + 2) >> 2
 *   first row   dst[0][x] = (top[x]  + 3*dc + 2) >> 2
 *   first col   dst[y][0] = (left[y] + 3*dc + 2) >> 2 */
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top,
                                         const uint8_t *src_left,
                                         uint8_t *dst, int32_t stride,
                                         int32_t flag)
{
    uint8_t *tmp_dst = dst;
    uint32_t row, col, val;
    uint32_t addition = 0;
    v16u8 src_above1, store, src_left1;
    v8u16 sum, sum_above, sum_left;
    v8u16 vec0, vec1, vec2;
    v16i8 zero = { 0 };

    src_above1 = LD_UB(src_top);
    src_left1 = LD_UB(src_left);

    /* reduce the 32 neighbour pixels to one scalar sum, then
     * rounded-shift by 5 to get the DC value */
    HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
    sum = sum_above + sum_left;
    sum = (v8u16) __msa_hadd_u_w(sum, sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
    sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
    sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
    addition = __msa_copy_u_w((v4i32) sum, 0);
    store = (v16u8) __msa_fill_b(addition);

    /* fill all sixteen rows with the DC value */
    for (row = 16; row--;) {
        ST_UB(store, dst);
        dst += stride;
    }

    if (0 == flag) {
        /* first row: (top[x] + 3*dc + 2) >> 2, corner patched into lane 0 */
        vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
        ILVRL_B2_UH(zero, src_above1, vec1, vec2);
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);   /* top + dc   */
        vec0 += vec0;                               /* 2*dc       */
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);   /* top + 3*dc */
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
        val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
        store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
        ST_UB(store, tmp_dst);

        /* first column, rows 1..15: (left[y] + 3*dc + 2) >> 2 */
        ILVRL_B2_UH(zero, src_left1, vec1, vec2);
        vec0 = (v8u16) __msa_fill_h(addition);
        vec0 *= 3;
        ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
        SRARI_H2_UH(vec1, vec2, 2);
        store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);

        for (col = 1; col < 16; col++) {
            tmp_dst[stride * col] = store[col];
        }
    }
}
515 
hevc_intra_pred_dc_32x32_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride)516 static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top,
517                                          const uint8_t *src_left,
518                                          uint8_t *dst, int32_t stride)
519 {
520     uint32_t row;
521     v16u8 src_above1, src_above2, store, src_left1, src_left2;
522     v8u16 sum_above1, sum_above2;
523     v8u16 sum_left1, sum_left2;
524     v8u16 sum, sum_above, sum_left;
525 
526     LD_UB2(src_top, 16, src_above1, src_above2);
527     LD_UB2(src_left, 16, src_left1, src_left2);
528     HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529     HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530     sum_above = sum_above1 + sum_above2;
531     sum_left = sum_left1 + sum_left2;
532     sum = sum_above + sum_left;
533     sum = (v8u16) __msa_hadd_u_w(sum, sum);
534     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535     sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536     sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537     sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538     store = (v16u8) __msa_splati_b((v16i8) sum, 0);
539 
540     for (row = 16; row--;) {
541         ST_UB2(store, store, dst, 16);
542         dst += stride;
543         ST_UB2(store, store, dst, 16);
544         dst += stride;
545     }
546 }
547 
/* 4x4 planar intra prediction (H.265 8.4.4.2.4):
 *   dst[y][x] = ((3 - x) * left[y] + (x + 1) * top[4]
 *              + (3 - y) * top[x] + (y + 1) * left[4] + 4) >> 3
 * computed two rows per v8i16 lane group. */
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint32_t src0, src1;
    v16i8 src_vec0, src_vec1;
    v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
    v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    /* low half: horizontal weights (3-x); high half: (x+1) */
    v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
    v16i8 zero = { 0 };

    src0 = LW(src_top);
    src1 = LW(src_left);

    /* mul_val1 = { 1, 2, 3, 4, 1, 2, 3, 4 } (odd double-word repeated) */
    mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);

    src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
    /* vecN = splat of left[N] */
    SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);

    tmp0 = __msa_fill_h(src_top[4]);   /* top-right neighbour  */
    tmp1 = __msa_fill_h(src_left[4]);  /* bottom-left neighbour */

    /* horizontal term: (3-x)*left[y] in the low half of each resN */
    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);

    /* + (x+1)*top[4] */
    res0 += mul_val1 * tmp0;
    res1 += mul_val1 * tmp0;
    res2 += mul_val1 * tmp0;
    res3 += mul_val1 * tmp0;

    /* vertical term: (3-y)*top[x] + (y+1)*left[4], y = 0..3 per resN */
    res0 += 3 * src_vec0_r;
    res1 += 2 * src_vec0_r;
    res2 += src_vec0_r;
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;

    PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
    SRARI_H2_SH(res0, res1, 3);
    src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}
594 
/* 8x8 planar intra prediction (H.265 8.4.4.2.4):
 *   dst[y][x] = ((7 - x) * left[y] + (x + 1) * top[8]
 *              + (7 - y) * top[x] + (y + 1) * left[8] + 8) >> 4
 * with one v8i16 accumulator (resN) per output row. */
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride)
{
    uint64_t src0, src1;
    v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
    v8i16 src_vec0_r, src_vec1_r;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v8i16 tmp0, tmp1, tmp2;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };  /* (x + 1) weights */
    v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };  /* (7 - x) weights */
    v16i8 zero = { 0 };

    src0 = LD(src_top);
    src1 = LD(src_left);

    src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
    src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);

    ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
    /* vecN = splat of left[N] */
    SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
    SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);

    tmp0 = __msa_fill_h(src_top[8]);   /* top-right neighbour   */
    tmp1 = __msa_fill_h(src_left[8]);  /* bottom-left neighbour */

    /* horizontal term: (7-x)*left[y] */
    MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
         res0, res1, res2, res3);
    MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
         res4, res5, res6, res7);

    /* + (x+1)*top[8], same for every row */
    tmp2 = mul_val1 * tmp0;
    res0 += tmp2;
    res1 += tmp2;
    res2 += tmp2;
    res3 += tmp2;
    res4 += tmp2;
    res5 += tmp2;
    res6 += tmp2;
    res7 += tmp2;

    /* vertical term: (7-y)*top[x] (weight 0 for row 7) ... */
    res0 += 7 * src_vec0_r;
    res1 += 6 * src_vec0_r;
    res2 += 5 * src_vec0_r;
    res3 += 4 * src_vec0_r;
    res4 += 3 * src_vec0_r;
    res5 += 2 * src_vec0_r;
    res6 += src_vec0_r;

    /* ... + (y+1)*left[8] */
    res0 += tmp1;
    res1 += 2 * tmp1;
    res2 += 3 * tmp1;
    res3 += 4 * tmp1;
    res4 += 5 * tmp1;
    res5 += 6 * tmp1;
    res6 += 7 * tmp1;
    res7 += 8 * tmp1;

    SRARI_H4_SH(res0, res1, res2, res3, 4);
    SRARI_H4_SH(res4, res5, res6, res7, 4);
    PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                src_vec0, src_vec1, src_vec2, src_vec3);

    ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
          0, 1, 0, 1, dst, stride);
}
662 
/* 16x16 planar intra prediction (H.265 8.4.4.2.4):
 *   dst[y][x] = ((15 - x) * left[y] + (x + 1) * top[16]
 *              + (15 - y) * top[x] + (y + 1) * left[16] + 16) >> 5
 * emitted two rows at a time via HEVC_PRED_PLANAR_16x2; the literal pair
 * passed to the macro (15,1), (13,3), ... is the (15-y, y+1) vertical
 * weight pair for the first of the two rows. */
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    v16u8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1;
    v8i16 res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };       /* (x+1),  x = 0..7  */
    v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 }; /* (15-x), x = 0..7  */

    src0 = LD_UB(src_top);
    src1 = LD_UB(src_left);

    /* widen top/left pixels; *_r covers x/y 0..7, *_l covers 8..15 */
    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* weights for columns 8..15 */
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    tmp0 = __msa_fill_h(src_top[16]);   /* top-right neighbour   */
    tmp1 = __msa_fill_h(src_left[16]);  /* bottom-left neighbour */

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 1, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 3, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 5, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 7, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 8..15 use the upper half of the left neighbours */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 9, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 11, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 13, 5);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 15, 5);
    ST_SH2(res0, res1, dst, stride);
}
742 
/* Compute a 16-wide x 16-high portion of a 32x32 planar prediction
 * (shift 6, i.e. /64 rounding) covering rows 0..15: the vertical weight
 * pairs passed to the macro run (31,1) down to (17,15).  'offset' is
 * subtracted from the (31-x) weights and added to the (x+1) weights,
 * i.e. it is the starting column of this 16-column slice within the
 * 32x32 block.  NOTE(review): presumably called with offset 0 and 16 by
 * a 32x32 wrapper outside this chunk -- confirm against callers. */
static void process_intra_upper_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1;
    v8i16 tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };        /* (x+1) weights  */
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };/* (31-x) weights */

    tmp0 = __msa_fill_h(src_top[32 - offset]); /* top-right of 32x32 block */
    tmp1 = __msa_fill_h(src_left[32]);         /* bottom-left neighbour    */

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    /* widen top/left pixels; *_r covers lanes 0..7, *_l covers 8..15 */
    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* shift the horizontal weights to this column slice */
    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 31, 1, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 29, 3, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 27, 5, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 25, 7, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* rows 8..15 use the upper half of the left neighbours */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 23, 9, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 21, 11, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 19, 13, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 17, 15, 6);
    ST_SH2(res0, res1, dst, stride);
}
825 
/* Planar-predict one 16x16 quadrant of the lower half of a 32x32 block.
 * offset is 0 for the left quadrant and 16 for the right one; it shifts
 * the horizontal weight ramps accordingly.  Rows are emitted two at a
 * time, with the per-row vertical weight pairs running from (15, 17)
 * for the first two rows down to (1, 31) for the last two. */
static void process_intra_lower_16x16_msa(const uint8_t *src_top,
                                          const uint8_t *src_left,
                                          uint8_t *dst, int32_t stride,
                                          uint8_t offset)
{
    v16i8 src0, src1;
    v8i16 src0_r, src1_r, src0_l, src1_l;
    v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
    v8i16 mul_val2, mul_val3;
    /* Horizontal weight ramps for the first 8 columns; mul_val2/3 below
     * extend them to columns 8..15. */
    v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };

    /* Replicated corner references: the top neighbour just right of the
     * 32-wide row for this quadrant, and the bottom-left reference
     * (src_left was advanced by 16 by the caller for the lower half). */
    tmp0 = __msa_fill_h(src_top[32 - offset]);
    tmp1 = __msa_fill_h(src_left[16]);

    src0 = LD_SB(src_top);
    src1 = LD_SB(src_left);

    /* Widen the 16 top/left reference bytes to 16-bit lanes. */
    UNPCK_UB_SH(src0, src0_r, src0_l);
    UNPCK_UB_SH(src1, src1_r, src1_l);

    /* Shift the column weights for the right quadrant (offset == 16). */
    mul_val1 += offset;
    mul_val0 -= offset;
    mul_val2 = mul_val0 - 8;
    mul_val3 = mul_val1 + 8;

    /* Each step splats two left-reference pixels (one per output row)
     * and produces two 16-pixel rows with the given row weights. */
    SPLATI_H2_SH(src1_r, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 15, 17, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 13, 19, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 11, 21, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_r, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 9, 23, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    /* Rows 8..15 use the upper half of the widened left references. */
    SPLATI_H2_SH(src1_l, 0, 1, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 7, 25, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 2, 3, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 5, 27, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 4, 5, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 3, 29, 6);
    ST_SH2(res0, res1, dst, stride);
    dst += (2 * stride);

    SPLATI_H2_SH(src1_l, 6, 7, vec0, vec1);
    HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,
                          mul_val0, mul_val1, mul_val2, mul_val3,
                          res0, res1, 1, 31, 6);
    ST_SH2(res0, res1, dst, stride);
}
907 
/* Planar intra prediction of a 32x32 block, computed as four 16x16
 * quadrants.  The trailing offset argument (0 or 16) selects the
 * column-weight ramp used inside the quadrant helpers. */
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top,
                                            const uint8_t *src_left,
                                            uint8_t *dst, int32_t stride)
{
    uint8_t *dst_bottom = dst + 16 * stride;
    const uint8_t *left_bottom = src_left + 16;

    /* top-left and top-right quadrants */
    process_intra_upper_16x16_msa(src_top, src_left, dst, stride, 0);
    process_intra_upper_16x16_msa(src_top + 16, src_left,
                                  dst + 16, stride, 16);

    /* bottom-left and bottom-right quadrants */
    process_intra_lower_16x16_msa(src_top, left_bottom,
                                  dst_bottom, stride, 0);
    process_intra_lower_16x16_msa(src_top + 16, left_bottom,
                                  dst_bottom + 16, stride, 16);
}
922 
/* HEVC angular intra prediction for a 4x4 block, vertical-ish modes
 * 18..34.  Row r interpolates two adjacent reference pixels selected by
 * the per-row displacement (r + 1) * angle:
 *   ((32 - fact) * ref[idx] + fact * ref[idx + 1] + 16) >> 5
 * All four rows are produced in a single vector pass. */
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    /* Inverse angles (Q8 fixed point) for the negative-angle modes. */
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, offset;
    uint64_t tmp0;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0;
    v16i8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    /* Most negative reference index used by the 4 rows: (4 * angle) >> 5. */
    last = (angle) >> 3;
    angle_loop = angle;

    ref = src_top - 1;
    /* last < -1 already implies angle < 0 (last = angle >> 3 is
     * non-negative for angle >= 0), so no separate sign test is needed;
     * this matches the 8/16/32-width variants. */
    if (last < -1) {
        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        /* Project the required left neighbours onto the extension of the
         * top reference row using the inverse angle. */
        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_left[offset];
        }

        ref = ref_tmp;
    }

    /* Per-row integer offset and 5-bit interpolation fraction. */
    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_SB(ref + idx0 + 1);
    top1 = LD_SB(ref + idx1 + 1);
    top2 = LD_SB(ref + idx2 + 1);
    top3 = LD_SB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);

    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);

    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);

    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    /* Pack the four 4-pixel rows pairwise so all rows are interpolated
     * with just two multiplies; diff1/diff3 hold the +1-shifted pixels. */
    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    /* Round ((... + 16) >> 5), pack to bytes and store 4 rows of 4. */
    SRARI_H2_SH(diff1, diff3, 5);
    dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
    ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}
1014 
/* HEVC angular intra prediction for an 8x8 block, vertical-ish modes
 * 18..34.  Row r interpolates two adjacent reference pixels selected by
 * the displacement (r + 1) * angle:
 *   ((32 - fact) * ref[idx] + fact * ref[idx + 1] + 16) >> 5 */
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    /* Inverse angles (Q8 fixed point) for the negative-angle modes. */
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last, offset;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t tmp0, tmp1, tmp2;
    v16i8 top0, top1, top2, top3;
    v16u8 dst_val0, dst_val1;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    /* Most negative reference index used by the 8 rows: (8 * angle) >> 5. */
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Negative angle: copy the top row, then project left neighbours
         * onto the extension of the reference (Q8 inverse angle). */
        inv_angle_val_loop = inv_angle_val * last;

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    /* Two passes of four rows each. */
    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        /* Per-row integer offset and 5-bit interpolation fraction. */
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);

        /* diff1/3/5/7 become the +1-shifted pixels (ref[idx + 1]). */
        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        /* Round ((... + 16) >> 5), pack to bytes, store 4 rows of 8. */
        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
        ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
        dst += (4 * stride);
    }
}
1113 
/* HEVC angular intra prediction for a 16x16 block, vertical-ish modes
 * 18..34.  Row r interpolates two adjacent reference pixels selected by
 * the displacement (r + 1) * angle:
 *   ((32 - fact) * ref[idx] + fact * ref[idx + 1] + 16) >> 5 */
static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    /* Inverse angles (Q8 fixed point) for the negative-angle modes. */
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t tmp0;
    int32_t angle, angle_loop, offset;
    int32_t inv_angle_val, inv_angle_val_loop;
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 16;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t last;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    /* Most negative reference index used by the 16 rows: (16*angle) >> 5. */
    last = angle >> 1;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Negative angle: copy the top row, then project left neighbours
         * onto the extension of the reference (Q8 inverse angle). */
        inv_angle_val_loop = inv_angle_val * last;

        top0 = LD_UB(ref);
        tmp0 = LW(ref + 16);
        ST_UB(top0, ref_tmp);
        SW(tmp0, ref_tmp + 16);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }
        ref = ref_tmp;
    }

    /* Four passes of four 16-pixel rows each. */
    for (v_cnt = 4; v_cnt--;) {
        /* Per-row integer offset and 5-bit interpolation fraction. */
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        idx2 = (angle_loop) >> 5;
        fact_val2 = (angle_loop) & 31;
        angle_loop += angle;

        idx3 = (angle_loop) >> 5;
        fact_val3 = (angle_loop) & 31;
        angle_loop += angle;

        /* Per row: 16 pixels plus the overflow byte needed for +1 shift. */
        LD_UB2(ref + idx0 + 1, 16, top0, top1);
        LD_UB2(ref + idx1 + 1, 16, top2, top3);
        LD_UB2(ref + idx2 + 1, 16, top4, top5);
        LD_UB2(ref + idx3 + 1, 16, top6, top7);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        /* top1/3/5/7 become the +1-shifted pixels (ref[idx + 1]). */
        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact3;
        diff7 += diff5 * fact3;
        diff10 += diff8 * fact5;
        diff11 += diff9 * fact5;
        diff14 += diff12 * fact7;
        diff15 += diff13 * fact7;

        /* Round ((... + 16) >> 5), pack to bytes, store 4 rows of 16. */
        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}
1223 
/* HEVC angular intra prediction for a 32x32 block, vertical-ish modes
 * 18..34.  Row r interpolates two adjacent reference pixels selected by
 * the displacement (r + 1) * angle:
 *   ((32 - fact) * ref[idx] + fact * ref[idx + 1] + 16) >> 5 */
static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top,
                                                      const uint8_t *src_left,
                                                      uint8_t *dst,
                                                      int32_t stride,
                                                      int32_t mode)
{
    /* Inverse angles (Q8 fixed point) for the negative-angle modes. */
    int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp;
    const uint8_t *ref;
    const uint8_t *src_left_tmp = src_left - 1;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t tmp0, tmp1, tmp2, tmp3;
    int32_t angle, angle_loop;
    int32_t inv_angle_val, inv_angle_val_loop;
    int32_t last, offset;
    v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
    v16i8 dst0, dst1, dst2, dst3;
    v8i16 fact0, fact1, fact2, fact3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;

    ref_tmp = ref_array + 32;

    angle = intra_pred_angle_up[mode - 18];
    inv_angle_val = inv_angle[mode - 18];
    /* Most negative reference index used by the 32 rows: (32*angle) >> 5. */
    last = angle;
    angle_loop = angle;

    ref = src_top - 1;
    if (last < -1) {
        /* Negative angle: copy the 36-byte top reference, then project
         * left neighbours onto its extension (Q8 inverse angle). */
        inv_angle_val_loop = inv_angle_val * last;
        LD_UB2(ref, 16, top0, top1);
        tmp0 = ref[32];
        tmp1 = ref[33];
        tmp2 = ref[34];
        tmp3 = ref[35];

        ST_UB2(top0, top1, ref_tmp, 16);
        ref_tmp[32] = tmp0;
        ref_tmp[33] = tmp1;
        ref_tmp[34] = tmp2;
        ref_tmp[35] = tmp3;

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (inv_angle_val_loop + 128) >> 8;
            ref_tmp[h_cnt] = src_left_tmp[offset];
            inv_angle_val_loop += inv_angle_val;
        }

        ref = ref_tmp;
    }

    /* 16 passes, two 32-pixel rows per pass. */
    for (v_cnt = 16; v_cnt--;) {
        /* Per-row integer offset and 5-bit interpolation fraction. */
        idx0 = (angle_loop) >> 5;
        fact_val0 = (angle_loop) & 31;
        angle_loop += angle;

        idx1 = (angle_loop) >> 5;
        fact_val1 = (angle_loop) & 31;
        angle_loop += angle;

        /* Row 0 uses top0/1/2/3, row 1 top4/5/6/7; the loads at +17 and
         * +33 supply the overflow bytes for the +1 byte shift below. */
        top0 = LD_UB(ref + idx0 + 1);
        top4 = LD_UB(ref + idx1 + 1);
        top1 = LD_UB(ref + idx0 + 17);
        top5 = LD_UB(ref + idx1 + 17);
        top3 = LD_UB(ref + idx0 + 33);
        top7 = LD_UB(ref + idx1 + 33);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);

        /* The middle 16 bytes serve both halves of each row. */
        top2 = top1;
        top6 = top5;

        /* top1/3/5/7 become the +1-shifted pixels (ref[idx + 1]). */
        SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
                   top1, top3, top5, top7);
        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        UNPCK_UB_SH(top4, diff8, diff9);
        UNPCK_UB_SH(top5, diff10, diff11);
        UNPCK_UB_SH(top6, diff12, diff13);
        UNPCK_UB_SH(top7, diff14, diff15);

        MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
             diff2, diff3, diff6, diff7);
        MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
             diff10, diff11, diff14, diff15);

        diff2 += diff0 * fact1;
        diff3 += diff1 * fact1;
        diff6 += diff4 * fact1;
        diff7 += diff5 * fact1;
        diff10 += diff8 * fact3;
        diff11 += diff9 * fact3;
        diff14 += diff12 * fact3;
        diff15 += diff13 * fact3;

        /* Round ((... + 16) >> 5), pack to bytes, store 2 rows of 32. */
        SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
        SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
        PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
                    dst0, dst1, dst2, dst3);

        ST_SB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_SB2(dst2, dst3, dst, 16);
        dst += stride;
    }
}
1337 
/* HEVC angular intra prediction for a 4x4 block, horizontal-ish modes
 * 2..17.  The prediction is computed along the left reference (columns
 * as rows) and then transposed into the destination. */
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    /* Inverse angles (Q8) for the negative-angle horizontal modes;
     * indexed with mode - 11 (only those modes reach the if below). */
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 4;
    const uint8_t *ref;
    int32_t last, offset;
    int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    uint64_t tmp0;
    v16i8 dst_val0, dst_val1;
    v16u8 top0, top1, top2, top3;
    v16u8 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    /* Most negative reference index used by the 4 columns. */
    last = angle >> 3;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        /* Negative angle: extend the left reference with projected top
         * neighbours (Q8 inverse angle). */
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LD(ref);
        SD(tmp0, ref_tmp);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
            ref_tmp[h_cnt] = src_top[offset];
        }

        ref = ref_tmp;
    }

    /* Per-column integer offset and 5-bit interpolation fraction. */
    idx0 = angle_loop >> 5;
    fact_val0 = angle_loop & 31;
    angle_loop += angle;

    idx1 = angle_loop >> 5;
    fact_val1 = angle_loop & 31;
    angle_loop += angle;

    idx2 = angle_loop >> 5;
    fact_val2 = angle_loop & 31;
    angle_loop += angle;

    idx3 = angle_loop >> 5;
    fact_val3 = angle_loop & 31;

    top0 = LD_UB(ref + idx0 + 1);
    top1 = LD_UB(ref + idx1 + 1);
    top2 = LD_UB(ref + idx2 + 1);
    top3 = LD_UB(ref + idx3 + 1);

    fact0 = __msa_fill_h(fact_val0);
    fact1 = __msa_fill_h(32 - fact_val0);
    fact2 = __msa_fill_h(fact_val1);
    fact3 = __msa_fill_h(32 - fact_val1);
    fact4 = __msa_fill_h(fact_val2);
    fact5 = __msa_fill_h(32 - fact_val2);
    fact6 = __msa_fill_h(fact_val3);
    fact7 = __msa_fill_h(32 - fact_val3);

    /* Pack the four 4-pixel columns pairwise so all columns are
     * interpolated with just two multiplies. */
    ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
    ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
    ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
               diff0, diff2, diff4, diff6);
    SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
               diff1, diff3, diff5, diff7);
    ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
    ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
    MUL2(diff1, fact0, diff3, fact2, diff1, diff3);

    diff1 += diff0 * fact1;
    diff3 += diff2 * fact3;

    /* Round ((... + 16) >> 5) and pack to bytes. */
    SRARI_H2_SH(diff1, diff3, 5);
    PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);

    /* Transpose the 4x4 result (computed column-wise) into row order. */
    diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
    diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);

    diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);

    dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
    dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);

    ST_W2(dst_val0, 0, 1, dst, stride);
    ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}
1434 
/* HEVC angular intra prediction for an 8x8 block, horizontal-ish modes
 * 2..17.  Four columns are computed per pass along the left reference
 * and transposed into the destination; dst advances by 4 columns. */
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
                                                     const uint8_t *src_left,
                                                     uint8_t *dst,
                                                     int32_t stride,
                                                     int32_t mode)
{
    /* Inverse angles (Q8) for the negative-angle horizontal modes;
     * indexed with mode - 11 (only those modes reach the if below). */
    int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref_tmp = ref_array + 8;
    const uint8_t *ref;
    const uint8_t *src_top_tmp = src_top - 1;
    uint8_t *dst_org;
    int32_t last, offset, tmp0, tmp1, tmp2;
    int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
    int32_t idx2, fact_val2, idx3, fact_val3;
    int32_t angle, angle_loop, inv_angle_val;
    v16i8 top0, top1, top2, top3;
    v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;

    angle = intra_pred_angle_low[mode - 2];
    /* Most negative reference index used by the 8 columns. */
    last = (angle) >> 2;
    angle_loop = angle;

    ref = src_left - 1;
    if (last < -1) {
        /* Negative angle: extend the left reference with projected top
         * neighbours (Q8 inverse angle). */
        inv_angle_val = inv_angle[mode - 11];

        tmp0 = LW(ref);
        tmp1 = LW(ref + 4);
        tmp2 = LW(ref + 8);
        SW(tmp0, ref_tmp);
        SW(tmp1, ref_tmp + 4);
        SW(tmp2, ref_tmp + 8);

        for (h_cnt = last; h_cnt <= -1; h_cnt++) {
            offset = (h_cnt * inv_angle_val + 128) >> 8;
            ref_tmp[h_cnt] = src_top_tmp[offset];
        }

        ref = ref_tmp;
    }

    /* Two passes: columns 0..3, then columns 4..7. */
    for (v_cnt = 0; v_cnt < 2; v_cnt++) {
        dst_org = dst;

        /* Per-column integer offset and 5-bit interpolation fraction. */
        idx0 = angle_loop >> 5;
        fact_val0 = angle_loop & 31;
        angle_loop += angle;

        idx1 = angle_loop >> 5;
        fact_val1 = angle_loop & 31;
        angle_loop += angle;

        idx2 = angle_loop >> 5;
        fact_val2 = angle_loop & 31;
        angle_loop += angle;

        idx3 = angle_loop >> 5;
        fact_val3 = angle_loop & 31;
        angle_loop += angle;

        top0 = LD_SB(ref + idx0 + 1);
        top1 = LD_SB(ref + idx1 + 1);
        top2 = LD_SB(ref + idx2 + 1);
        top3 = LD_SB(ref + idx3 + 1);

        fact0 = __msa_fill_h(fact_val0);
        fact1 = __msa_fill_h(32 - fact_val0);
        fact2 = __msa_fill_h(fact_val1);
        fact3 = __msa_fill_h(32 - fact_val1);
        fact4 = __msa_fill_h(fact_val2);
        fact5 = __msa_fill_h(32 - fact_val2);
        fact6 = __msa_fill_h(fact_val3);
        fact7 = __msa_fill_h(32 - fact_val3);

        UNPCK_UB_SH(top0, diff0, diff1);
        UNPCK_UB_SH(top1, diff2, diff3);
        UNPCK_UB_SH(top2, diff4, diff5);
        UNPCK_UB_SH(top3, diff6, diff7);
        /* diff1/3/5/7 become the +1-shifted pixels (ref[idx + 1]). */
        SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
                   diff1, diff3, diff5, diff7);
        MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
             diff1, diff3, diff5, diff7);

        diff1 += diff0 * fact1;
        diff3 += diff2 * fact3;
        diff5 += diff4 * fact5;
        diff7 += diff6 * fact7;

        /* Round, pack, then interleave to transpose the four 8-pixel
         * columns into eight 4-byte row fragments. */
        SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
        PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
                    dst_val0, dst_val1, dst_val2, dst_val3);
        ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
        ILVRL_H2_SH(diff1, diff0, diff3, diff4);
        ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
        dst += 4;
    }
}
1535 
hevc_intra_pred_angular_lower_16width_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t mode)1536 static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top,
1537                                                       const uint8_t *src_left,
1538                                                       uint8_t *dst,
1539                                                       int32_t stride,
1540                                                       int32_t mode)
1541 {
1542     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544     int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545     v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546     v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547     v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1550     int32_t angle, angle_loop, inv_angle_val, offset;
1551     uint8_t ref_array[3 * 32 + 4];
1552     uint8_t *ref_tmp = ref_array + 16;
1553     const uint8_t *ref, *src_top_tmp = src_top - 1;
1554     uint8_t *dst_org;
1555     int32_t last;
1556 
1557     angle = intra_pred_angle_low[mode - 2];
1558     last = (angle) >> 1;
1559     angle_loop = angle;
1560 
1561     ref = src_left - 1;
1562     if (last < -1) {
1563         inv_angle_val = inv_angle[mode - 11];
1564 
1565         top0 = LD_SB(ref);
1566         tmp0 = LW(ref + 16);
1567         ST_SB(top0, ref_tmp);
1568         SW(tmp0, ref_tmp + 16);
1569 
1570         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571             offset = (h_cnt * inv_angle_val + 128) >> 8;
1572             ref_tmp[h_cnt] = src_top_tmp[offset];
1573         }
1574 
1575         ref = ref_tmp;
1576     }
1577 
1578     for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1579         dst_org = dst;
1580 
1581         idx0 = angle_loop >> 5;
1582         fact_val0 = angle_loop & 31;
1583         angle_loop += angle;
1584 
1585         idx1 = angle_loop >> 5;
1586         fact_val1 = angle_loop & 31;
1587         angle_loop += angle;
1588 
1589         idx2 = angle_loop >> 5;
1590         fact_val2 = angle_loop & 31;
1591         angle_loop += angle;
1592 
1593         idx3 = angle_loop >> 5;
1594         fact_val3 = angle_loop & 31;
1595         angle_loop += angle;
1596 
1597         LD_SB2(ref + idx0 + 1, 16, top0, top1);
1598         LD_SB2(ref + idx1 + 1, 16, top2, top3);
1599         LD_SB2(ref + idx2 + 1, 16, top4, top5);
1600         LD_SB2(ref + idx3 + 1, 16, top6, top7);
1601 
1602         fact0 = __msa_fill_h(fact_val0);
1603         fact1 = __msa_fill_h(32 - fact_val0);
1604         fact2 = __msa_fill_h(fact_val1);
1605         fact3 = __msa_fill_h(32 - fact_val1);
1606         fact4 = __msa_fill_h(fact_val2);
1607         fact5 = __msa_fill_h(32 - fact_val2);
1608         fact6 = __msa_fill_h(fact_val3);
1609         fact7 = __msa_fill_h(32 - fact_val3);
1610 
1611         SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612                    top1, top3, top5, top7);
1613 
1614         UNPCK_UB_SH(top0, diff0, diff1);
1615         UNPCK_UB_SH(top1, diff2, diff3);
1616         UNPCK_UB_SH(top2, diff4, diff5);
1617         UNPCK_UB_SH(top3, diff6, diff7);
1618         UNPCK_UB_SH(top4, diff8, diff9);
1619         UNPCK_UB_SH(top5, diff10, diff11);
1620         UNPCK_UB_SH(top6, diff12, diff13);
1621         UNPCK_UB_SH(top7, diff14, diff15);
1622 
1623         MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624              diff2, diff3, diff6, diff7);
1625         MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626              diff10, diff11, diff14, diff15);
1627 
1628         diff2 += diff0 * fact1;
1629         diff3 += diff1 * fact1;
1630         diff6 += diff4 * fact3;
1631         diff7 += diff5 * fact3;
1632         diff10 += diff8 * fact5;
1633         diff11 += diff9 * fact5;
1634         diff14 += diff12 * fact7;
1635         diff15 += diff13 * fact7;
1636 
1637         SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1638         SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1639         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640                     dst_val0, dst_val1, dst_val2, dst_val3);
1641         ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642         ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1643         ILVRL_H2_SH(diff1, diff0, diff4, diff5);
1644         ILVRL_H2_SH(diff3, diff2, diff6, diff7);
1645         ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1646         dst_org += (8 * stride);
1647         ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1648         dst += 4;
1649     }
1650 }
1651 
hevc_intra_pred_angular_lower_32width_msa(const uint8_t * src_top,const uint8_t * src_left,uint8_t * dst,int32_t stride,int32_t mode)1652 static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top,
1653                                                       const uint8_t *src_left,
1654                                                       uint8_t *dst,
1655                                                       int32_t stride,
1656                                                       int32_t mode)
1657 {
1658     int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1659     int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1660     v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1661     v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1662     v8i16 fact0, fact1, fact2, fact3;
1663     v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1664     v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1665     int32_t angle, angle_loop, inv_angle_val, offset;
1666     uint8_t ref_array[3 * 32 + 4];
1667     uint8_t *ref_tmp = ref_array + 32;
1668     const uint8_t *ref, *src_top_tmp = src_top - 1;
1669     uint8_t *dst_org;
1670     int32_t last;
1671 
1672     angle = intra_pred_angle_low[mode - 2];
1673     last = angle;
1674     angle_loop = angle;
1675 
1676     ref = src_left - 1;
1677     if (last < -1) {
1678         inv_angle_val = inv_angle[mode - 11];
1679 
1680         LD_SB2(ref, 16, top0, top1);
1681         tmp0 = LW(ref + 32);
1682         ST_SB2(top0, top1, ref_tmp, 16);
1683         SW(tmp0, ref_tmp + 32);
1684 
1685         for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1686             offset = (h_cnt * inv_angle_val + 128) >> 8;
1687             ref_tmp[h_cnt] = src_top_tmp[offset];
1688         }
1689 
1690         ref = ref_tmp;
1691     }
1692 
1693     for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1694         dst_org = dst;
1695         idx0 = angle_loop >> 5;
1696         fact_val0 = angle_loop & 31;
1697         angle_loop += angle;
1698 
1699         idx1 = angle_loop >> 5;
1700         fact_val1 = angle_loop & 31;
1701         angle_loop += angle;
1702 
1703         top0 = LD_SB(ref + idx0 + 1);
1704         top4 = LD_SB(ref + idx1 + 1);
1705         top1 = LD_SB(ref + idx0 + 17);
1706         top5 = LD_SB(ref + idx1 + 17);
1707         top3 = LD_SB(ref + idx0 + 33);
1708         top7 = LD_SB(ref + idx1 + 33);
1709 
1710         fact0 = __msa_fill_h(fact_val0);
1711         fact1 = __msa_fill_h(32 - fact_val0);
1712         fact2 = __msa_fill_h(fact_val1);
1713         fact3 = __msa_fill_h(32 - fact_val1);
1714 
1715         top2 = top1;
1716         top6 = top5;
1717 
1718         SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1719                    top1, top3, top5, top7);
1720 
1721         UNPCK_UB_SH(top0, diff0, diff1);
1722         UNPCK_UB_SH(top1, diff2, diff3);
1723         UNPCK_UB_SH(top2, diff4, diff5);
1724         UNPCK_UB_SH(top3, diff6, diff7);
1725         UNPCK_UB_SH(top4, diff8, diff9);
1726         UNPCK_UB_SH(top5, diff10, diff11);
1727         UNPCK_UB_SH(top6, diff12, diff13);
1728         UNPCK_UB_SH(top7, diff14, diff15);
1729 
1730         MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1731              diff2, diff3, diff6, diff7);
1732         MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1733              diff10, diff11, diff14, diff15);
1734 
1735         diff2 += diff0 * fact1;
1736         diff3 += diff1 * fact1;
1737         diff6 += diff4 * fact1;
1738         diff7 += diff5 * fact1;
1739         diff10 += diff8 * fact3;
1740         diff11 += diff9 * fact3;
1741         diff14 += diff12 * fact3;
1742         diff15 += diff13 * fact3;
1743 
1744         SRARI_H4_SH(diff2, diff3, diff6, diff7, 5);
1745         SRARI_H4_SH(diff10, diff11, diff14, diff15, 5);
1746         PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1747                     dst_val0, dst_val1, dst_val2, dst_val3);
1748         ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
1749         ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
1750 
1751         ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1752         dst_org += (8 * stride);
1753         ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1754         dst_org += (8 * stride);
1755         ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1756         dst_org += (8 * stride);
1757         ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1758         dst_org += (8 * stride);
1759 
1760         dst += 2;
1761     }
1762 }
1763 
intra_predict_vert_32x32_msa(const uint8_t * src,uint8_t * dst,int32_t dst_stride)1764 static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst,
1765                                          int32_t dst_stride)
1766 {
1767     uint32_t row;
1768     v16u8 src1, src2;
1769 
1770     src1 = LD_UB(src);
1771     src2 = LD_UB(src + 16);
1772 
1773     for (row = 32; row--;) {
1774         ST_UB2(src1, src2, dst, 16);
1775         dst += dst_stride;
1776     }
1777 }
1778 
/* HEVCPredDSP entry point: planar intra prediction for log2 size index 0
 * (4x4); thin wrapper around the MSA kernel. */
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_4x4_msa(src_top, src_left, dst, stride);
}
1786 
/* HEVCPredDSP entry point: planar intra prediction for log2 size index 1
 * (8x8); thin wrapper around the MSA kernel. */
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_8x8_msa(src_top, src_left, dst, stride);
}
1794 
/* HEVCPredDSP entry point: planar intra prediction for log2 size index 2
 * (16x16); thin wrapper around the MSA kernel. */
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_16x16_msa(src_top, src_left, dst, stride);
}
1802 
/* HEVCPredDSP entry point: planar intra prediction for log2 size index 3
 * (32x32); thin wrapper around the MSA kernel. */
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst,
                                     const uint8_t *src_top,
                                     const uint8_t *src_left,
                                     ptrdiff_t stride)
{
    hevc_intra_pred_plane_32x32_msa(src_top, src_left, dst, stride);
}
1810 
/* DC intra prediction dispatcher: selects the MSA kernel matching the
 * block size (log2 expected in [2, 5]; other values are a no-op).  The
 * 32x32 kernel takes no c_idx — NOTE(review): presumably because no
 * DC filtering applies at that size, confirm against the scalar path. */
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top,
                               const uint8_t *src_left,
                               ptrdiff_t stride, int log2, int c_idx)
{
    if (log2 == 2) {
        hevc_intra_pred_dc_4x4_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 3) {
        hevc_intra_pred_dc_8x8_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 4) {
        hevc_intra_pred_dc_16x16_msa(src_top, src_left, dst, stride, c_idx);
    } else if (log2 == 5) {
        hevc_intra_pred_dc_32x32_msa(src_top, src_left, dst, stride);
    }
}
1833 
/* Angular intra prediction dispatcher for 4x4 blocks: the pure-horizontal
 * (10) and pure-vertical (26) modes use dedicated kernels; the remaining
 * angles split into upper (>= 18) and lower mode groups. */
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
                                                     dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
                                                     dst, stride, mode);
        break;
    }
}
1851 
/* Angular intra prediction dispatcher for 8x8 blocks: the pure-horizontal
 * (10) and pure-vertical (26) modes use dedicated kernels; the remaining
 * angles split into upper (>= 18) and lower mode groups. */
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_8x8_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_8x8_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_8width_msa(src_top, src_left,
                                                     dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_8width_msa(src_top, src_left,
                                                     dst, stride, mode);
        break;
    }
}
1869 
/* Angular intra prediction dispatcher for 16x16 blocks: the pure-horizontal
 * (10) and pure-vertical (26) modes use dedicated kernels; the remaining
 * angles split into upper (>= 18) and lower mode groups. */
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_16x16_msa(src_top, src_left, dst, stride, c_idx);
        break;
    case 26:
        hevc_intra_pred_vert_16x16_msa(src_top, src_left, dst, stride, c_idx);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_16width_msa(src_top, src_left,
                                                      dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_16width_msa(src_top, src_left,
                                                      dst, stride, mode);
        break;
    }
}
1887 
/* Angular intra prediction dispatcher for 32x32 blocks: the pure-horizontal
 * (10) and pure-vertical (26) modes use dedicated kernels (the 32x32 ones
 * take no c_idx); the remaining angles split into upper (>= 18) and lower
 * mode groups. */
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst,
                                      const uint8_t *src_top,
                                      const uint8_t *src_left,
                                      ptrdiff_t stride, int c_idx, int mode)
{
    switch (mode) {
    case 10:
        hevc_intra_pred_horiz_32x32_msa(src_top, src_left, dst, stride);
        break;
    case 26:
        intra_predict_vert_32x32_msa(src_top, dst, stride);
        break;
    default:
        if (mode >= 18)
            hevc_intra_pred_angular_upper_32width_msa(src_top, src_left,
                                                      dst, stride, mode);
        else
            hevc_intra_pred_angular_lower_32width_msa(src_top, src_left,
                                                      dst, stride, mode);
        break;
    }
}
1905 
ff_intra_pred_8_16x16_msa(HEVCContext * s,int x0,int y0,int c_idx)1906 void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
1907 {
1908     v16u8 vec0;
1909     HEVCLocalContext *lc = s->HEVClc;
1910     int i;
1911     int hshift = s->ps.sps->hshift[c_idx];
1912     int vshift = s->ps.sps->vshift[c_idx];
1913     int size_in_luma_h = 16 << hshift;
1914     int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
1915     int size_in_luma_v = 16 << vshift;
1916     int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
1917     int x = x0 >> hshift;
1918     int y = y0 >> vshift;
1919     int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1920     int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
1921 
1922     int cur_tb_addr =
1923         s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
1924 
1925     ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
1926     uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
1927 
1928     int min_pu_width = s->ps.sps->min_pu_width;
1929 
1930     enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
1931         lc->tu.intra_pred_mode;
1932     uint32_t a;
1933     uint8_t left_array[2 * 32 + 1];
1934     uint8_t filtered_left_array[2 * 32 + 1];
1935     uint8_t top_array[2 * 32 + 1];
1936     uint8_t filtered_top_array[2 * 32 + 1];
1937 
1938     uint8_t *left = left_array + 1;
1939     uint8_t *top = top_array + 1;
1940     uint8_t *filtered_left = filtered_left_array + 1;
1941     uint8_t *filtered_top = filtered_top_array + 1;
1942     int cand_bottom_left = lc->na.cand_bottom_left
1943         && cur_tb_addr >
1944         s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
1945                                (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
1946     int cand_left = lc->na.cand_left;
1947     int cand_up_left = lc->na.cand_up_left;
1948     int cand_up = lc->na.cand_up;
1949     int cand_up_right = lc->na.cand_up_right
1950         && cur_tb_addr >
1951         s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
1952                                ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
1953 
1954     int bottom_left_size =
1955         (((y0 + 2 * size_in_luma_v) >
1956           (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
1957                                                  2 * size_in_luma_v)) -
1958          (y0 + size_in_luma_v)) >> vshift;
1959     int top_right_size =
1960         (((x0 + 2 * size_in_luma_h) >
1961           (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
1962          (x0 + size_in_luma_h)) >> hshift;
1963 
1964     if (s->ps.pps->constrained_intra_pred_flag == 1) {
1965         int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1966         int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
1967         int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1968         int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
1969         if (!size_in_luma_pu_h)
1970             size_in_luma_pu_h++;
1971         if (cand_bottom_left == 1 && on_pu_edge_x) {
1972             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1973             int y_bottom_pu =
1974                 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
1975             int max =
1976                 ((size_in_luma_pu_v) >
1977                  (s->ps.sps->min_pu_height -
1978                   y_bottom_pu) ? (s->ps.sps->min_pu_height -
1979                                   y_bottom_pu) : (size_in_luma_pu_v));
1980             cand_bottom_left = 0;
1981             for (i = 0; i < max; i += 2)
1982                 cand_bottom_left |=
1983                     ((s->ref->tab_mvf[(x_left_pu) +
1984                                       (y_bottom_pu +
1985                                        i) * min_pu_width]).pred_flag ==
1986                      PF_INTRA);
1987         }
1988         if (cand_left == 1 && on_pu_edge_x) {
1989             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
1990             int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
1991             int max =
1992                 ((size_in_luma_pu_v) >
1993                  (s->ps.sps->min_pu_height -
1994                   y_left_pu) ? (s->ps.sps->min_pu_height -
1995                                 y_left_pu) : (size_in_luma_pu_v));
1996             cand_left = 0;
1997             for (i = 0; i < max; i += 2)
1998                 cand_left |=
1999                     ((s->ref->tab_mvf[(x_left_pu) +
2000                                       (y_left_pu +
2001                                        i) * min_pu_width]).pred_flag ==
2002                      PF_INTRA);
2003         }
2004         if (cand_up_left == 1) {
2005             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2006             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2007             cand_up_left =
2008                 (s->ref->tab_mvf[(x_left_pu) +
2009                                  (y_top_pu) * min_pu_width]).pred_flag ==
2010                 PF_INTRA;
2011         }
2012         if (cand_up == 1 && on_pu_edge_y) {
2013             int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2014             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2015             int max =
2016                 ((size_in_luma_pu_h) >
2017                  (s->ps.sps->min_pu_width -
2018                   x_top_pu) ? (s->ps.sps->min_pu_width -
2019                                x_top_pu) : (size_in_luma_pu_h));
2020             cand_up = 0;
2021             for (i = 0; i < max; i += 2)
2022                 cand_up |=
2023                     ((s->ref->tab_mvf[(x_top_pu + i) +
2024                                       (y_top_pu) *
2025                                       min_pu_width]).pred_flag == PF_INTRA);
2026         }
2027         if (cand_up_right == 1 && on_pu_edge_y) {
2028             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2029             int x_right_pu =
2030                 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2031             int max =
2032                 ((size_in_luma_pu_h) >
2033                  (s->ps.sps->min_pu_width -
2034                   x_right_pu) ? (s->ps.sps->min_pu_width -
2035                                  x_right_pu) : (size_in_luma_pu_h));
2036             cand_up_right = 0;
2037             for (i = 0; i < max; i += 2)
2038                 cand_up_right |=
2039                     ((s->ref->tab_mvf[(x_right_pu + i) +
2040                                       (y_top_pu) *
2041                                       min_pu_width]).pred_flag == PF_INTRA);
2042         }
2043 
2044         vec0 = (v16u8) __msa_ldi_b(128);
2045 
2046         ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2047 
2048         ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2049 
2050         top[-1] = 128;
2051     }
2052     if (cand_up_left) {
2053         left[-1] = src[(-1) + stride * (-1)];
2054         top[-1] = left[-1];
2055     }
2056     if (cand_up) {
2057         vec0 = LD_UB(src - stride);
2058         ST_UB(vec0, top);
2059     }
2060     if (cand_up_right) {
2061         vec0 = LD_UB(src - stride + 16);
2062         ST_UB(vec0, (top + 16));
2063 
2064         do {
2065             uint32_t pix =
2066                 ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2067                  0x01010101U);
2068             for (i = 0; i < (16 - top_right_size); i += 4)
2069                 ((((union unaligned_32 *) (top + 16 + top_right_size +
2070                                            i))->l) = (pix));
2071         } while (0);
2072     }
2073     if (cand_left)
2074         for (i = 0; i < 16; i++)
2075             left[i] = src[(-1) + stride * (i)];
2076     if (cand_bottom_left) {
2077         for (i = 16; i < 16 + bottom_left_size; i++)
2078             left[i] = src[(-1) + stride * (i)];
2079         do {
2080             uint32_t pix =
2081                 ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2082                  0x01010101U);
2083             for (i = 0; i < (16 - bottom_left_size); i += 4)
2084                 ((((union unaligned_32 *) (left + 16 + bottom_left_size +
2085                                            i))->l) = (pix));
2086         } while (0);
2087     }
2088 
2089     if (s->ps.pps->constrained_intra_pred_flag == 1) {
2090         if (cand_bottom_left || cand_left || cand_up_left || cand_up
2091             || cand_up_right) {
2092             int size_max_x =
2093                 x0 + ((2 * 16) << hshift) <
2094                 s->ps.sps->width ? 2 * 16 : (s->ps.sps->width - x0) >> hshift;
2095             int size_max_y =
2096                 y0 + ((2 * 16) << vshift) <
2097                 s->ps.sps->height ? 2 * 16 : (s->ps.sps->height - y0) >> vshift;
2098             int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2099             if (!cand_up_right) {
2100                 size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2101                     16 : (s->ps.sps->width - x0) >> hshift;
2102             }
2103             if (!cand_bottom_left) {
2104                 size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2105                     16 : (s->ps.sps->height - y0) >> vshift;
2106             }
2107             if (cand_bottom_left || cand_left || cand_up_left) {
2108                 while (j > -1
2109                        &&
2110                        !((s->ref->tab_mvf[(((x0 +
2111                                              ((-1) << hshift)) >> s->ps.sps->
2112                                             log2_min_pu_size)) + (((y0 +
2113                                                                     ((j) <<
2114                                                                      vshift))
2115                                                                    >> s->ps.sps->
2116                                                                    log2_min_pu_size))
2117                                           * min_pu_width]).pred_flag ==
2118                          PF_INTRA))
2119                     j--;
2120                 if (!
2121                     ((s->ref->tab_mvf[(((x0 +
2122                                          ((-1) << hshift)) >> s->ps.sps->
2123                                         log2_min_pu_size)) + (((y0 + ((j)
2124                                                                       <<
2125                                                                       vshift))
2126                                                                >> s->ps.sps->
2127                                                                log2_min_pu_size))
2128                                       * min_pu_width]).pred_flag == PF_INTRA)) {
2129                     j = 0;
2130                     while (j < size_max_x
2131                            &&
2132                            !((s->ref->tab_mvf[(((x0 +
2133                                                  ((j) << hshift)) >> s->ps.sps->
2134                                                 log2_min_pu_size)) + (((y0 +
2135                                                                         ((-1) <<
2136                                                                          vshift))
2137                                                                        >> s->
2138                                                                        ps.sps->
2139                                                                        log2_min_pu_size))
2140                                               * min_pu_width]).pred_flag ==
2141                              PF_INTRA))
2142                         j++;
2143                     for (i = j; i > (j) - (j + 1); i--)
2144                         if (!
2145                             ((s->ref->tab_mvf[(((x0 +
2146                                                  ((i -
2147                                                    1) << hshift)) >> s->ps.sps->
2148                                                 log2_min_pu_size)) + (((y0 +
2149                                                                         ((-1) <<
2150                                                                          vshift))
2151                                                                        >> s->
2152                                                                        ps.sps->
2153                                                                        log2_min_pu_size))
2154                                               * min_pu_width]).pred_flag ==
2155                              PF_INTRA))
2156                             top[i - 1] = top[i];
2157                     left[-1] = top[-1];
2158                 }
2159             } else {
2160                 j = 0;
2161                 while (j < size_max_x
2162                        &&
2163                        !((s->ref->tab_mvf[(((x0 +
2164                                              ((j) << hshift)) >> s->ps.sps->
2165                                             log2_min_pu_size)) + (((y0 + ((-1)
2166                                                                           <<
2167                                                                           vshift))
2168                                                                    >> s->ps.sps->
2169                                                                    log2_min_pu_size))
2170                                           * min_pu_width]).pred_flag ==
2171                          PF_INTRA))
2172                     j++;
2173                 if (j > 0)
2174                     if (x0 > 0) {
2175                         for (i = j; i > (j) - (j + 1); i--)
2176                             if (!
2177                                 ((s->ref->tab_mvf[(((x0 +
2178                                                      ((i -
2179                                                        1) << hshift)) >>
2180                                                     s->ps.sps->log2_min_pu_size))
2181                                                   + (((y0 + ((-1)
2182                                                              << vshift))
2183                                                       >>
2184                                                       s->ps.sps->log2_min_pu_size))
2185                                                   *
2186                                                   min_pu_width]).pred_flag ==
2187                                  PF_INTRA))
2188                                 top[i - 1] = top[i];
2189                     } else {
2190                         for (i = j; i > (j) - (j); i--)
2191                             if (!
2192                                 ((s->ref->tab_mvf[(((x0 +
2193                                                      ((i -
2194                                                        1) << hshift)) >>
2195                                                     s->ps.sps->log2_min_pu_size))
2196                                                   + (((y0 + ((-1)
2197                                                              << vshift))
2198                                                       >>
2199                                                       s->ps.sps->log2_min_pu_size))
2200                                                   *
2201                                                   min_pu_width]).pred_flag ==
2202                                  PF_INTRA))
2203                                 top[i - 1] = top[i];
2204                         top[-1] = top[0];
2205                     }
2206                 left[-1] = top[-1];
2207             }
2208             left[-1] = top[-1];
2209             if (cand_bottom_left || cand_left) {
2210                 a = ((left[-1]) * 0x01010101U);
2211                 for (i = 0; i < (0) + (size_max_y); i += 4)
2212                     if (!
2213                         ((s->ref->tab_mvf[(((x0 +
2214                                              ((-1) << hshift)) >> s->ps.sps->
2215                                             log2_min_pu_size)) + (((y0 +
2216                                                                     ((i) <<
2217                                                                      vshift))
2218                                                                    >> s->ps.sps->
2219                                                                    log2_min_pu_size))
2220                                           * min_pu_width]).pred_flag ==
2221                          PF_INTRA))
2222                         ((((union unaligned_32 *) (&left[i]))->l) = (a));
2223                     else
2224                         a = ((left[i + 3]) * 0x01010101U);
2225             }
2226             if (!cand_left) {
2227                 vec0 = (v16u8) __msa_fill_b(left[-1]);
2228 
2229                 ST_UB(vec0, left);
2230             }
2231             if (!cand_bottom_left) {
2232 
2233                 vec0 = (v16u8) __msa_fill_b(left[15]);
2234 
2235                 ST_UB(vec0, (left + 16));
2236             }
2237             if (x0 != 0 && y0 != 0) {
2238                 a = ((left[size_max_y - 1]) * 0x01010101U);
2239                 for (i = (size_max_y - 1);
2240                      i > (size_max_y - 1) - (size_max_y); i -= 4)
2241                     if (!
2242                         ((s->ref->tab_mvf[(((x0 +
2243                                              ((-1) << hshift)) >> s->ps.sps->
2244                                             log2_min_pu_size)) + (((y0 +
2245                                                                     ((i -
2246                                                                       3) <<
2247                                                                      vshift))
2248                                                                    >> s->ps.sps->
2249                                                                    log2_min_pu_size))
2250                                           * min_pu_width]).pred_flag ==
2251                          PF_INTRA))
2252                         ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2253                     else
2254                         a = ((left[i - 3]) * 0x01010101U);
2255                 if (!
2256                     ((s->ref->tab_mvf[(((x0 +
2257                                          ((-1) << hshift)) >> s->ps.sps->
2258                                         log2_min_pu_size)) + (((y0 + ((-1)
2259                                                                       <<
2260                                                                       vshift))
2261                                                                >> s->ps.sps->
2262                                                                log2_min_pu_size))
2263                                       * min_pu_width]).pred_flag == PF_INTRA))
2264                     left[-1] = left[0];
2265             } else if (x0 == 0) {
2266                 do {
2267                     uint32_t pix = ((0) * 0x01010101U);
2268                     for (i = 0; i < (size_max_y); i += 4)
2269                         ((((union unaligned_32 *) (left + i))->l) = (pix));
2270                 } while (0);
2271             } else {
2272                 a = ((left[size_max_y - 1]) * 0x01010101U);
2273                 for (i = (size_max_y - 1);
2274                      i > (size_max_y - 1) - (size_max_y); i -= 4)
2275                     if (!
2276                         ((s->ref->tab_mvf[(((x0 +
2277                                              ((-1) << hshift)) >> s->ps.sps->
2278                                             log2_min_pu_size)) + (((y0 +
2279                                                                     ((i -
2280                                                                       3) <<
2281                                                                      vshift))
2282                                                                    >> s->ps.sps->
2283                                                                    log2_min_pu_size))
2284                                           * min_pu_width]).pred_flag ==
2285                          PF_INTRA))
2286                         ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2287                     else
2288                         a = ((left[i - 3]) * 0x01010101U);
2289             }
2290             top[-1] = left[-1];
2291             if (y0 != 0) {
2292                 a = ((left[-1]) * 0x01010101U);
2293                 for (i = 0; i < (0) + (size_max_x); i += 4)
2294                     if (!
2295                         ((s->ref->tab_mvf[(((x0 +
2296                                              ((i) << hshift)) >> s->ps.sps->
2297                                             log2_min_pu_size)) + (((y0 + ((-1)
2298                                                                           <<
2299                                                                           vshift))
2300                                                                    >> s->ps.sps->
2301                                                                    log2_min_pu_size))
2302                                           * min_pu_width]).pred_flag ==
2303                          PF_INTRA))
2304                         ((((union unaligned_32 *) (&top[i]))->l) = (a));
2305                     else
2306                         a = ((top[i + 3]) * 0x01010101U);
2307             }
2308         }
2309     }
2310 
2311     if (!cand_bottom_left) {
2312         if (cand_left) {
2313             vec0 = (v16u8) __msa_fill_b(left[15]);
2314 
2315             ST_UB(vec0, (left + 16));
2316 
2317         } else if (cand_up_left) {
2318             vec0 = (v16u8) __msa_fill_b(left[-1]);
2319 
2320             ST_UB2(vec0, vec0, left, 16);
2321 
2322             cand_left = 1;
2323         } else if (cand_up) {
2324             left[-1] = top[0];
2325 
2326             vec0 = (v16u8) __msa_fill_b(left[-1]);
2327 
2328             ST_UB2(vec0, vec0, left, 16);
2329 
2330             cand_up_left = 1;
2331             cand_left = 1;
2332         } else if (cand_up_right) {
2333             vec0 = (v16u8) __msa_fill_b(top[16]);
2334 
2335             ST_UB(vec0, top);
2336 
2337             left[-1] = top[16];
2338 
2339             ST_UB2(vec0, vec0, left, 16);
2340 
2341             cand_up = 1;
2342             cand_up_left = 1;
2343             cand_left = 1;
2344         } else {
2345             left[-1] = 128;
2346             vec0 = (v16u8) __msa_ldi_b(128);
2347 
2348             ST_UB2(vec0, vec0, top, 16);
2349             ST_UB2(vec0, vec0, left, 16);
2350         }
2351     }
2352 
2353     if (!cand_left) {
2354         vec0 = (v16u8) __msa_fill_b(left[16]);
2355         ST_UB(vec0, left);
2356     }
2357     if (!cand_up_left) {
2358         left[-1] = left[0];
2359     }
2360     if (!cand_up) {
2361         vec0 = (v16u8) __msa_fill_b(left[-1]);
2362         ST_UB(vec0, top);
2363     }
2364     if (!cand_up_right) {
2365         vec0 = (v16u8) __msa_fill_b(top[15]);
2366         ST_UB(vec0, (top + 16));
2367     }
2368 
2369     top[-1] = left[-1];
2370 
2371 
2372     if (!s->ps.sps->intra_smoothing_disabled_flag
2373         && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2374         if (mode != INTRA_DC && 16 != 4) {
2375             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376             int min_dist_vert_hor =
2377                 (((((int) (mode - 26U)) >=
2378                    0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2379                  ((((int) (mode - 10U)) >=
2380                    0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2381                  ? ((((int) (mode - 10U)) >=
2382                      0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2383                  : ((((int) (mode - 26U)) >=
2384                      0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2385             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386                 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387                 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388                 for (i = 2 * 16 - 2; i >= 0; i--)
2389                     filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390                                         left[i - 1] + 2) >> 2;
2391                 filtered_top[-1] =
2392                     filtered_left[-1] =
2393                     (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394                 for (i = 2 * 16 - 2; i >= 0; i--)
2395                     filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396                                        top[i - 1] + 2) >> 2;
2397                 left = filtered_left;
2398                 top = filtered_top;
2399             }
2400         }
2401     }
2402 
2403     switch (mode) {
2404     case INTRA_PLANAR:
2405         s->hpc.pred_planar[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2406                                    (uint8_t *) left, stride);
2407         break;
2408     case INTRA_DC:
2409         s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
2410                        (uint8_t *) left, stride, 4, c_idx);
2411         break;
2412     default:
2413         s->hpc.pred_angular[4 - 2] ((uint8_t *) src, (uint8_t *) top,
2414                                     (uint8_t *) left, stride, c_idx, mode);
2415         break;
2416     }
2417 }
2418 
ff_intra_pred_8_32x32_msa(HEVCContext * s,int x0,int y0,int c_idx)2419 void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
2420 {
2421     v16u8 vec0, vec1;
2422     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423     v8i16 res0, res1, res2, res3;
2424     v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425     v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2426     HEVCLocalContext *lc = s->HEVClc;
2427     int i;
2428     int hshift = s->ps.sps->hshift[c_idx];
2429     int vshift = s->ps.sps->vshift[c_idx];
2430     int size_in_luma_h = 32 << hshift;
2431     int size_in_tbs_h = size_in_luma_h >> s->ps.sps->log2_min_tb_size;
2432     int size_in_luma_v = 32 << vshift;
2433     int size_in_tbs_v = size_in_luma_v >> s->ps.sps->log2_min_tb_size;
2434     int x = x0 >> hshift;
2435     int y = y0 >> vshift;
2436     int x_tb = (x0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2437     int y_tb = (y0 >> s->ps.sps->log2_min_tb_size) & s->ps.sps->tb_mask;
2438 
2439     int cur_tb_addr =
2440         s->ps.pps->min_tb_addr_zs[(y_tb) * (s->ps.sps->tb_mask + 2) + (x_tb)];
2441 
2442     ptrdiff_t stride = s->frame->linesize[c_idx] / sizeof(uint8_t);
2443     uint8_t *src = (uint8_t *) s->frame->data[c_idx] + x + y * stride;
2444 
2445     int min_pu_width = s->ps.sps->min_pu_width;
2446 
2447     enum IntraPredMode mode = c_idx ? lc->tu.intra_pred_mode_c :
2448         lc->tu.intra_pred_mode;
2449     uint32_t a;
2450     uint8_t left_array[2 * 32 + 1];
2451     uint8_t filtered_left_array[2 * 32 + 1];
2452     uint8_t top_array[2 * 32 + 1];
2453     uint8_t filtered_top_array[2 * 32 + 1];
2454 
2455     uint8_t *left = left_array + 1;
2456     uint8_t *top = top_array + 1;
2457     uint8_t *filtered_left = filtered_left_array + 1;
2458     uint8_t *filtered_top = filtered_top_array + 1;
2459     int cand_bottom_left = lc->na.cand_bottom_left
2460         && cur_tb_addr >
2461         s->ps.pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) & s->ps.sps->tb_mask) *
2462                                (s->ps.sps->tb_mask + 2) + (x_tb - 1)];
2463     int cand_left = lc->na.cand_left;
2464     int cand_up_left = lc->na.cand_up_left;
2465     int cand_up = lc->na.cand_up;
2466     int cand_up_right = lc->na.cand_up_right
2467         && cur_tb_addr >
2468         s->ps.pps->min_tb_addr_zs[(y_tb - 1) * (s->ps.sps->tb_mask + 2) +
2469                                ((x_tb + size_in_tbs_h) & s->ps.sps->tb_mask)];
2470 
2471     int bottom_left_size =
2472         (((y0 + 2 * size_in_luma_v) >
2473           (s->ps.sps->height) ? (s->ps.sps->height) : (y0 +
2474                                                  2 * size_in_luma_v)) -
2475          (y0 + size_in_luma_v)) >> vshift;
2476     int top_right_size =
2477         (((x0 + 2 * size_in_luma_h) >
2478           (s->ps.sps->width) ? (s->ps.sps->width) : (x0 + 2 * size_in_luma_h)) -
2479          (x0 + size_in_luma_h)) >> hshift;
2480 
2481     if (s->ps.pps->constrained_intra_pred_flag == 1) {
2482         int size_in_luma_pu_v = ((size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2483         int size_in_luma_pu_h = ((size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2484         int on_pu_edge_x = !(x0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2485         int on_pu_edge_y = !(y0 & ((1 << s->ps.sps->log2_min_pu_size) - 1));
2486         if (!size_in_luma_pu_h)
2487             size_in_luma_pu_h++;
2488         if (cand_bottom_left == 1 && on_pu_edge_x) {
2489             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2490             int y_bottom_pu =
2491                 ((y0 + size_in_luma_v) >> s->ps.sps->log2_min_pu_size);
2492             int max =
2493                 ((size_in_luma_pu_v) >
2494                  (s->ps.sps->min_pu_height -
2495                   y_bottom_pu) ? (s->ps.sps->min_pu_height -
2496                                   y_bottom_pu) : (size_in_luma_pu_v));
2497             cand_bottom_left = 0;
2498             for (i = 0; i < max; i += 2)
2499                 cand_bottom_left |=
2500                     ((s->ref->tab_mvf[(x_left_pu) +
2501                                       (y_bottom_pu +
2502                                        i) * min_pu_width]).pred_flag ==
2503                      PF_INTRA);
2504         }
2505         if (cand_left == 1 && on_pu_edge_x) {
2506             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2507             int y_left_pu = ((y0) >> s->ps.sps->log2_min_pu_size);
2508             int max =
2509                 ((size_in_luma_pu_v) >
2510                  (s->ps.sps->min_pu_height -
2511                   y_left_pu) ? (s->ps.sps->min_pu_height -
2512                                 y_left_pu) : (size_in_luma_pu_v));
2513             cand_left = 0;
2514             for (i = 0; i < max; i += 2)
2515                 cand_left |=
2516                     ((s->ref->tab_mvf[(x_left_pu) +
2517                                       (y_left_pu +
2518                                        i) * min_pu_width]).pred_flag ==
2519                      PF_INTRA);
2520         }
2521         if (cand_up_left == 1) {
2522             int x_left_pu = ((x0 - 1) >> s->ps.sps->log2_min_pu_size);
2523             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2524             cand_up_left =
2525                 (s->ref->tab_mvf[(x_left_pu) +
2526                                  (y_top_pu) * min_pu_width]).pred_flag ==
2527                 PF_INTRA;
2528         }
2529         if (cand_up == 1 && on_pu_edge_y) {
2530             int x_top_pu = ((x0) >> s->ps.sps->log2_min_pu_size);
2531             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2532             int max =
2533                 ((size_in_luma_pu_h) >
2534                  (s->ps.sps->min_pu_width -
2535                   x_top_pu) ? (s->ps.sps->min_pu_width -
2536                                x_top_pu) : (size_in_luma_pu_h));
2537             cand_up = 0;
2538             for (i = 0; i < max; i += 2)
2539                 cand_up |=
2540                     ((s->ref->tab_mvf[(x_top_pu + i) +
2541                                       (y_top_pu) *
2542                                       min_pu_width]).pred_flag == PF_INTRA);
2543         }
2544         if (cand_up_right == 1 && on_pu_edge_y) {
2545             int y_top_pu = ((y0 - 1) >> s->ps.sps->log2_min_pu_size);
2546             int x_right_pu =
2547                 ((x0 + size_in_luma_h) >> s->ps.sps->log2_min_pu_size);
2548             int max =
2549                 ((size_in_luma_pu_h) >
2550                  (s->ps.sps->min_pu_width -
2551                   x_right_pu) ? (s->ps.sps->min_pu_width -
2552                                  x_right_pu) : (size_in_luma_pu_h));
2553             cand_up_right = 0;
2554             for (i = 0; i < max; i += 2)
2555                 cand_up_right |=
2556                     ((s->ref->tab_mvf[(x_right_pu + i) +
2557                                       (y_top_pu) *
2558                                       min_pu_width]).pred_flag == PF_INTRA);
2559         }
2560         vec0 = (v16u8) __msa_ldi_b(128);
2561 
2562         ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2563         ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2564 
2565         top[-1] = 128;
2566     }
2567     if (cand_up_left) {
2568         left[-1] = src[(-1) + stride * (-1)];
2569         top[-1] = left[-1];
2570     }
2571     if (cand_up) {
2572         LD_UB2(src - stride, 16, vec0, vec1);
2573         ST_UB2(vec0, vec1, top, 16);
2574     }
2575 
2576     if (cand_up_right) {
2577         LD_UB2(src - stride + 32, 16, vec0, vec1);
2578         ST_UB2(vec0, vec1, (top + 32), 16);
2579         do {
2580             uint32_t pix =
2581                 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2582                  0x01010101U);
2583             for (i = 0; i < (32 - top_right_size); i += 4)
2584                 ((((union unaligned_32 *) (top + 32 + top_right_size +
2585                                            i))->l) = (pix));
2586         } while (0);
2587     }
2588     if (cand_left)
2589         for (i = 0; i < 32; i++)
2590             left[i] = src[(-1) + stride * (i)];
2591     if (cand_bottom_left) {
2592         for (i = 32; i < 32 + bottom_left_size; i++)
2593             left[i] = src[(-1) + stride * (i)];
2594         do {
2595             uint32_t pix =
2596                 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2597                  0x01010101U);
2598             for (i = 0; i < (32 - bottom_left_size); i += 4)
2599                 ((((union unaligned_32 *) (left + 32 + bottom_left_size +
2600                                            i))->l) = (pix));
2601         } while (0);
2602     }
2603 
2604     if (s->ps.pps->constrained_intra_pred_flag == 1) {
2605         if (cand_bottom_left || cand_left || cand_up_left || cand_up
2606             || cand_up_right) {
2607             int size_max_x =
2608                 x0 + ((2 * 32) << hshift) <
2609                 s->ps.sps->width ? 2 * 32 : (s->ps.sps->width - x0) >> hshift;
2610             int size_max_y =
2611                 y0 + ((2 * 32) << vshift) <
2612                 s->ps.sps->height ? 2 * 32 : (s->ps.sps->height - y0) >> vshift;
2613             int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2614             if (!cand_up_right) {
2615                 size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2616                     32 : (s->ps.sps->width - x0) >> hshift;
2617             }
2618             if (!cand_bottom_left) {
2619                 size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2620                     32 : (s->ps.sps->height - y0) >> vshift;
2621             }
2622             if (cand_bottom_left || cand_left || cand_up_left) {
2623                 while (j > -1
2624                        &&
2625                        !((s->ref->tab_mvf[(((x0 +
2626                                              ((-1) << hshift)) >> s->ps.sps->
2627                                             log2_min_pu_size)) + (((y0 +
2628                                                                     ((j) <<
2629                                                                      vshift))
2630                                                                    >> s->ps.sps->
2631                                                                    log2_min_pu_size))
2632                                           * min_pu_width]).pred_flag ==
2633                          PF_INTRA))
2634                     j--;
2635                 if (!
2636                     ((s->ref->tab_mvf[(((x0 +
2637                                          ((-1) << hshift)) >> s->ps.sps->
2638                                         log2_min_pu_size)) + (((y0 + ((j)
2639                                                                       <<
2640                                                                       vshift))
2641                                                                >> s->ps.sps->
2642                                                                log2_min_pu_size))
2643                                       * min_pu_width]).pred_flag == PF_INTRA)) {
2644                     j = 0;
2645                     while (j < size_max_x
2646                            &&
2647                            !((s->ref->tab_mvf[(((x0 +
2648                                                  ((j) << hshift)) >> s->ps.sps->
2649                                                 log2_min_pu_size)) + (((y0 +
2650                                                                         ((-1) <<
2651                                                                          vshift))
2652                                                                        >> s->
2653                                                                        ps.sps->
2654                                                                        log2_min_pu_size))
2655                                               * min_pu_width]).pred_flag ==
2656                              PF_INTRA))
2657                         j++;
2658                     for (i = j; i > (j) - (j + 1); i--)
2659                         if (!
2660                             ((s->ref->tab_mvf[(((x0 +
2661                                                  ((i -
2662                                                    1) << hshift)) >> s->ps.sps->
2663                                                 log2_min_pu_size)) + (((y0 +
2664                                                                         ((-1) <<
2665                                                                          vshift))
2666                                                                        >> s->
2667                                                                        ps.sps->
2668                                                                        log2_min_pu_size))
2669                                               * min_pu_width]).pred_flag ==
2670                              PF_INTRA))
2671                             top[i - 1] = top[i];
2672                     left[-1] = top[-1];
2673                 }
2674             } else {
2675                 j = 0;
2676                 while (j < size_max_x
2677                        &&
2678                        !((s->ref->tab_mvf[(((x0 +
2679                                              ((j) << hshift)) >> s->ps.sps->
2680                                             log2_min_pu_size)) + (((y0 + ((-1)
2681                                                                           <<
2682                                                                           vshift))
2683                                                                    >> s->ps.sps->
2684                                                                    log2_min_pu_size))
2685                                           * min_pu_width]).pred_flag ==
2686                          PF_INTRA))
2687                     j++;
2688                 if (j > 0)
2689                     if (x0 > 0) {
2690                         for (i = j; i > (j) - (j + 1); i--)
2691                             if (!
2692                                 ((s->ref->tab_mvf[(((x0 +
2693                                                      ((i -
2694                                                        1) << hshift)) >>
2695                                                     s->ps.sps->log2_min_pu_size))
2696                                                   + (((y0 + ((-1)
2697                                                              << vshift))
2698                                                       >>
2699                                                       s->ps.sps->log2_min_pu_size))
2700                                                   *
2701                                                   min_pu_width]).pred_flag ==
2702                                  PF_INTRA))
2703                                 top[i - 1] = top[i];
2704                     } else {
2705                         for (i = j; i > (j) - (j); i--)
2706                             if (!
2707                                 ((s->ref->tab_mvf[(((x0 +
2708                                                      ((i -
2709                                                        1) << hshift)) >>
2710                                                     s->ps.sps->log2_min_pu_size))
2711                                                   + (((y0 + ((-1)
2712                                                              << vshift))
2713                                                       >>
2714                                                       s->ps.sps->log2_min_pu_size))
2715                                                   *
2716                                                   min_pu_width]).pred_flag ==
2717                                  PF_INTRA))
2718                                 top[i - 1] = top[i];
2719                         top[-1] = top[0];
2720                     }
2721                 left[-1] = top[-1];
2722             }
2723             left[-1] = top[-1];
2724             if (cand_bottom_left || cand_left) {
2725                 a = ((left[-1]) * 0x01010101U);
2726                 for (i = 0; i < (0) + (size_max_y); i += 4)
2727                     if (!
2728                         ((s->ref->tab_mvf[(((x0 +
2729                                              ((-1) << hshift)) >> s->ps.sps->
2730                                             log2_min_pu_size)) + (((y0 +
2731                                                                     ((i) <<
2732                                                                      vshift))
2733                                                                    >> s->ps.sps->
2734                                                                    log2_min_pu_size))
2735                                           * min_pu_width]).pred_flag ==
2736                          PF_INTRA))
2737                         ((((union unaligned_32 *) (&left[i]))->l) = (a));
2738                     else
2739                         a = ((left[i + 3]) * 0x01010101U);
2740             }
2741             if (!cand_left) {
2742                 vec0 = (v16u8) __msa_fill_b(left[-1]);
2743 
2744                 ST_UB2(vec0, vec0, left, 16);
2745             }
2746             if (!cand_bottom_left) {
2747                 vec0 = (v16u8) __msa_fill_b(left[31]);
2748 
2749                 ST_UB2(vec0, vec0, (left + 32), 16);
2750             }
2751             if (x0 != 0 && y0 != 0) {
2752                 a = ((left[size_max_y - 1]) * 0x01010101U);
2753                 for (i = (size_max_y - 1);
2754                      i > (size_max_y - 1) - (size_max_y); i -= 4)
2755                     if (!
2756                         ((s->ref->tab_mvf[(((x0 +
2757                                              ((-1) << hshift)) >> s->ps.sps->
2758                                             log2_min_pu_size)) + (((y0 +
2759                                                                     ((i -
2760                                                                       3) <<
2761                                                                      vshift))
2762                                                                    >> s->ps.sps->
2763                                                                    log2_min_pu_size))
2764                                           * min_pu_width]).pred_flag ==
2765                          PF_INTRA))
2766                         ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2767                     else
2768                         a = ((left[i - 3]) * 0x01010101U);
2769                 if (!
2770                     ((s->ref->tab_mvf[(((x0 +
2771                                          ((-1) << hshift)) >> s->ps.sps->
2772                                         log2_min_pu_size)) + (((y0 + ((-1)
2773                                                                       <<
2774                                                                       vshift))
2775                                                                >> s->ps.sps->
2776                                                                log2_min_pu_size))
2777                                       * min_pu_width]).pred_flag == PF_INTRA))
2778                     left[-1] = left[0];
2779             } else if (x0 == 0) {
2780                 do {
2781                     uint32_t pix = ((0) * 0x01010101U);
2782                     for (i = 0; i < (size_max_y); i += 4)
2783                         ((((union unaligned_32 *) (left + i))->l) = (pix));
2784                 } while (0);
2785             } else {
2786                 a = ((left[size_max_y - 1]) * 0x01010101U);
2787                 for (i = (size_max_y - 1);
2788                      i > (size_max_y - 1) - (size_max_y); i -= 4)
2789                     if (!
2790                         ((s->ref->tab_mvf[(((x0 +
2791                                              ((-1) << hshift)) >> s->ps.sps->
2792                                             log2_min_pu_size)) + (((y0 +
2793                                                                     ((i -
2794                                                                       3) <<
2795                                                                      vshift))
2796                                                                    >> s->ps.sps->
2797                                                                    log2_min_pu_size))
2798                                           * min_pu_width]).pred_flag ==
2799                          PF_INTRA))
2800                         ((((union unaligned_32 *) (&left[i - 3]))->l) = (a));
2801                     else
2802                         a = ((left[i - 3]) * 0x01010101U);
2803             }
2804             top[-1] = left[-1];
2805             if (y0 != 0) {
2806                 a = ((left[-1]) * 0x01010101U);
2807                 for (i = 0; i < (0) + (size_max_x); i += 4)
2808                     if (!
2809                         ((s->ref->tab_mvf[(((x0 +
2810                                              ((i) << hshift)) >> s->ps.sps->
2811                                             log2_min_pu_size)) + (((y0 + ((-1)
2812                                                                           <<
2813                                                                           vshift))
2814                                                                    >> s->ps.sps->
2815                                                                    log2_min_pu_size))
2816                                           * min_pu_width]).pred_flag ==
2817                          PF_INTRA))
2818                         ((((union unaligned_32 *) (&top[i]))->l) = (a));
2819                     else
2820                         a = ((top[i + 3]) * 0x01010101U);
2821             }
2822         }
2823     }
2824 
2825     if (!cand_bottom_left) {
2826         if (cand_left) {
2827             vec0 = (v16u8) __msa_fill_b(left[31]);
2828 
2829             ST_UB2(vec0, vec0, (left + 32), 16);
2830         } else if (cand_up_left) {
2831             vec0 = (v16u8) __msa_fill_b(left[-1]);
2832 
2833             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2834 
2835             cand_left = 1;
2836         } else if (cand_up) {
2837             left[-1] = top[0];
2838 
2839             vec0 = (v16u8) __msa_fill_b(left[-1]);
2840 
2841             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2842 
2843             cand_up_left = 1;
2844             cand_left = 1;
2845         } else if (cand_up_right) {
2846             vec0 = (v16u8) __msa_fill_b(top[32]);
2847 
2848             ST_UB2(vec0, vec0, top, 16);
2849 
2850             left[-1] = top[32];
2851 
2852             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2853 
2854             cand_up = 1;
2855             cand_up_left = 1;
2856             cand_left = 1;
2857         } else {
2858             left[-1] = 128;
2859 
2860             vec0 = (v16u8) __msa_ldi_b(128);
2861 
2862             ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2863             ST_UB4(vec0, vec0, vec0, vec0, left, 16);
2864         }
2865     }
2866 
2867     if (!cand_left) {
2868         vec0 = (v16u8) __msa_fill_b(left[32]);
2869 
2870         ST_UB2(vec0, vec0, left, 16);
2871     }
2872     if (!cand_up_left) {
2873         left[-1] = left[0];
2874     }
2875     if (!cand_up) {
2876         vec0 = (v16u8) __msa_fill_b(left[-1]);
2877 
2878         ST_UB2(vec0, vec0, top, 16);
2879     }
2880     if (!cand_up_right) {
2881         vec0 = (v16u8) __msa_fill_b(top[31]);
2882 
2883         ST_UB2(vec0, vec0, (top + 32), 16);
2884     }
2885 
2886     top[-1] = left[-1];
2887 
2888 
2889     if (!s->ps.sps->intra_smoothing_disabled_flag
2890         && (c_idx == 0 || s->ps.sps->chroma_format_idc == 3)) {
2891         if (mode != INTRA_DC && 32 != 4) {
2892             int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2893             int min_dist_vert_hor =
2894                 (((((int) (mode - 26U)) >=
2895                    0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))) >
2896                  ((((int) (mode - 10U)) >=
2897                    0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2898                  ? ((((int) (mode - 10U)) >=
2899                      0 ? ((int) (mode - 10U)) : (-((int) (mode - 10U)))))
2900                  : ((((int) (mode - 26U)) >=
2901                      0 ? ((int) (mode - 26U)) : (-((int) (mode - 26U))))));
2902             if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2903                 int threshold = 1 << (8 - 5);
2904                 if (s->ps.sps->sps_strong_intra_smoothing_enable_flag
2905                     && c_idx == 0
2906                     && ((top[-1] + top[63] - 2 * top[31]) >=
2907                         0 ? (top[-1] + top[63] -
2908                              2 * top[31]) : (-(top[-1] + top[63] -
2909                                                2 * top[31]))) < threshold
2910                     && ((left[-1] + left[63] - 2 * left[31]) >=
2911                         0 ? (left[-1] + left[63] -
2912                              2 * left[31]) : (-(left[-1] + left[63] -
2913                                                 2 * left[31]))) < threshold) {
2914 
2915 
2916                     filtered_top[-1] = top[-1];
2917                     filtered_top[63] = top[63];
2918 
2919 
2920                     for (i = 0; i < 63; i++) {
2921                         filtered_top[i] =
2922                             ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2923                     }
2924 
2925                     tmp0 = __msa_fill_h(top[-1]);
2926                     tmp1 = __msa_fill_h(top[63]);
2927 
2928                     tmp2 = mul_val0 - 8;
2929                     tmp3 = mul_val0 - 16;
2930                     tmp4 = mul_val0 - 24;
2931                     tmp5 = mul_val1 + 8;
2932                     tmp6 = mul_val1 + 16;
2933                     tmp7 = mul_val1 + 24;
2934 
2935                     res0 = mul_val0 * tmp0;
2936                     res1 = tmp2 * tmp0;
2937                     res2 = tmp3 * tmp0;
2938                     res3 = tmp4 * tmp0;
2939                     res0 += mul_val1 * tmp1;
2940                     res1 += tmp5 * tmp1;
2941                     res2 += tmp6 * tmp1;
2942                     res3 += tmp7 * tmp1;
2943 
2944                     res0 = __msa_srari_h(res0, 6);
2945                     res1 = __msa_srari_h(res1, 6);
2946                     res2 = __msa_srari_h(res2, 6);
2947                     res3 = __msa_srari_h(res3, 6);
2948 
2949                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2950                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2951 
2952                     ST_UB2(vec0, vec1, filtered_top, 16);
2953 
2954                     res0 = mul_val0 - 32;
2955                     tmp2 = mul_val0 - 40;
2956                     tmp3 = mul_val0 - 48;
2957                     tmp4 = mul_val0 - 56;
2958                     res3 = mul_val1 + 32;
2959                     tmp5 = mul_val1 + 40;
2960                     tmp6 = mul_val1 + 48;
2961                     tmp7 = mul_val1 + 56;
2962 
2963                     res0 = res0 * tmp0;
2964                     res1 = tmp2 * tmp0;
2965                     res2 = tmp3 * tmp0;
2966                     res0 += res3 * tmp1;
2967                     res3 = tmp4 * tmp0;
2968                     res1 += tmp5 * tmp1;
2969                     res2 += tmp6 * tmp1;
2970                     res3 += tmp7 * tmp1;
2971 
2972                     res0 = __msa_srari_h(res0, 6);
2973                     res1 = __msa_srari_h(res1, 6);
2974                     res2 = __msa_srari_h(res2, 6);
2975                     res3 = __msa_srari_h(res3, 6);
2976 
2977                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2978                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2979 
2980                     ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2981 
2982                     filtered_top[63] = top[63];
2983 
2984                     tmp0 = __msa_fill_h(left[-1]);
2985                     tmp1 = __msa_fill_h(left[63]);
2986 
2987                     tmp2 = mul_val0 - 8;
2988                     tmp3 = mul_val0 - 16;
2989                     tmp4 = mul_val0 - 24;
2990                     tmp5 = mul_val1 + 8;
2991                     tmp6 = mul_val1 + 16;
2992                     tmp7 = mul_val1 + 24;
2993 
2994                     res0 = mul_val0 * tmp0;
2995                     res1 = tmp2 * tmp0;
2996                     res2 = tmp3 * tmp0;
2997                     res3 = tmp4 * tmp0;
2998                     res0 += mul_val1 * tmp1;
2999                     res1 += tmp5 * tmp1;
3000                     res2 += tmp6 * tmp1;
3001                     res3 += tmp7 * tmp1;
3002 
3003                     res0 = __msa_srari_h(res0, 6);
3004                     res1 = __msa_srari_h(res1, 6);
3005                     res2 = __msa_srari_h(res2, 6);
3006                     res3 = __msa_srari_h(res3, 6);
3007 
3008                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3009                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3010 
3011                     ST_UB2(vec0, vec1, left, 16);
3012 
3013                     res0 = mul_val0 - 32;
3014                     tmp2 = mul_val0 - 40;
3015                     tmp3 = mul_val0 - 48;
3016                     tmp4 = mul_val0 - 56;
3017                     res3 = mul_val1 + 32;
3018                     tmp5 = mul_val1 + 40;
3019                     tmp6 = mul_val1 + 48;
3020                     tmp7 = mul_val1 + 56;
3021 
3022                     res0 = res0 * tmp0;
3023                     res1 = tmp2 * tmp0;
3024                     res2 = tmp3 * tmp0;
3025                     res0 += res3 * tmp1;
3026                     res3 = tmp4 * tmp0;
3027                     res1 += tmp5 * tmp1;
3028                     res2 += tmp6 * tmp1;
3029                     res3 += tmp7 * tmp1;
3030 
3031                     res0 = __msa_srari_h(res0, 6);
3032                     res1 = __msa_srari_h(res1, 6);
3033                     res2 = __msa_srari_h(res2, 6);
3034                     res3 = __msa_srari_h(res3, 6);
3035 
3036                     vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3037                     vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3038 
3039                     ST_UB2(vec0, vec1, (left + 32), 16);
3040 
3041                     left[63] = tmp1[0];
3042 
3043                     top = filtered_top;
3044                 } else {
3045                     filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3046                     filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3047                     for (i = 2 * 32 - 2; i >= 0; i--)
3048                         filtered_left[i] = (left[i + 1] + 2 * left[i] +
3049                                             left[i - 1] + 2) >> 2;
3050                     filtered_top[-1] =
3051                         filtered_left[-1] =
3052                         (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3053                     for (i = 2 * 32 - 2; i >= 0; i--)
3054                         filtered_top[i] = (top[i + 1] + 2 * top[i] +
3055                                            top[i - 1] + 2) >> 2;
3056                     left = filtered_left;
3057                     top = filtered_top;
3058                 }
3059             }
3060         }
3061     }
3062 
3063     switch (mode) {
3064     case INTRA_PLANAR:
3065         s->hpc.pred_planar[3] ((uint8_t *) src, (uint8_t *) top,
3066                                (uint8_t *) left, stride);
3067         break;
3068     case INTRA_DC:
3069         s->hpc.pred_dc((uint8_t *) src, (uint8_t *) top,
3070                        (uint8_t *) left, stride, 5, c_idx);
3071         break;
3072     default:
3073         s->hpc.pred_angular[3] ((uint8_t *) src, (uint8_t *) top,
3074                                 (uint8_t *) left, stride, c_idx, mode);
3075         break;
3076     }
3077 }
3078