/*****************************************************************************
 * predict-c.c: msa intra prediction
 *****************************************************************************
 * Copyright (C) 2015-2021 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH
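/* Vertical 4x4 prediction: the 4 pixels above the block are copied into
 * each of the 4 destination rows. */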
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_src_data;

    u_src_data = LW( p_src );

    SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride );
}

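/* Vertical 8x8 prediction: the 8 pixels above the block are copied into
 * each of the 8 destination rows. */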
static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_out;

    u_out = LD( p_src );

    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

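/* Vertical 16x16 prediction: the 16-pixel row above the block is copied
 * into each of the 16 destination rows. */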
static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                          int32_t i_dst_stride )
{
    v16u8 src0 = LD_UB( p_src );

    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
}

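/* Horizontal 4x4 prediction: each destination row is filled with its left
 * neighbour, replicated across the row by the 0x01010101 multiply. */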
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    u_out0 = p_src[0 * i_src_stride] * 0x01010101;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

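/* Horizontal 8x8 prediction: each destination row is filled with its left
 * neighbour, replicated to 8 bytes. */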
static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;

    u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull;
    u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull;
    u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull;
    u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull;
    u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull;
    u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull;
    u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull;
    u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull;

    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
}

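/* Horizontal 16x16 prediction: each left-neighbour pixel is broadcast
 * across its 16-byte row, four rows per loop iteration. */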
static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    uint32_t u_row;
    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
    v16u8 src0, src1, src2, src3;

    for( u_row = 4; u_row--; )
    {
        u_inp0 = p_src[0];
        p_src += i_src_stride;
        u_inp1 = p_src[0];
        p_src += i_src_stride;
        u_inp2 = p_src[0];
        p_src += i_src_stride;
        u_inp3 = p_src[0];
        p_src += i_src_stride;

        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );

        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}

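/* DC 4x4 prediction: average the available top and/or left neighbours with
 * rounding (or use 128 if neither is available) and fill the block with
 * the result. */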
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      int32_t i_src_stride_left,
                                      uint8_t *p_dst, int32_t i_dst_stride,
                                      uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_out, u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum;

    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 4 ) >> 3;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 2 ) >> 2;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );

    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

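/* DC 8x8 prediction: sum the 8 top and 8 left neighbours in a single
 * vector, round and shift by 4, then fill the block with the average. */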
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_val0, u_val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    u_val0 = LD( p_src_top );
    u_val1 = LD( p_src_left );
    INSERT_D2_UB( u_val0, u_val1, src );
    sum_h = __msa_hadd_u_h( src, src );
    sum_w = __msa_hadd_u_w( sum_h, sum_h );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );

    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
}

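/* DC 16x16 prediction: average the available top and/or left neighbours
 * with rounding (or use 128 if neither is available) and fill the block
 * with the result. */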
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                        int32_t i_src_stride_left,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 16 ) >> 5;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 8 ) >> 4;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
}

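/* Plane 8x8 prediction: horizontal and vertical gradients are derived from
 * the top and left border pixels, then the block is filled in place with
 * the clipped linear ramp (p_src points at the block itself). */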
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
    uint64_t u_out0, u_out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;

    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    sum = __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );

    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );

    i_res0 *= 17;
    i_res1 *= 17;
    i_res0 = ( i_res0 + 16 ) >> 5;
    i_res1 = ( i_res1 + 16 ) >> 5;

    i_res3 = 3 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
    i_res = i_res2 - i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res );
    vec2 = __msa_fill_w( i_res1 );
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;

    for( u_lpcnt = 4; u_lpcnt--; )
    {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;

        SRA_4V( vec0, vec1, vec6, vec7, 5 );
        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
        CLIP_SH2_0_255( vec10, vec11 );
        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );

        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
        SD( u_out0, p_src );
        p_src += i_stride;
        SD( u_out1, p_src );
        p_src += i_stride;

        vec4 += vec2;
    }
}

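/* Plane 16x16 prediction: same scheme as the 8x8 version, with gradients
 * taken from the 16-pixel top and left borders and one full 16-byte row
 * written per loop iteration. */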
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res0, i_res1, i_res2, i_res3;
    uint64_t u_load0, u_load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top = { 0 };
    v8i16 vec9, vec10;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;

    u_load0 = LD( p_src - ( i_stride + 1 ) );
    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );

    INSERT_D2_UB( u_load0, u_load1, p_src_top );

    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );

    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );

    i_res0 *= 5;
    i_res1 *= 5;
    i_res0 = ( i_res0 + 32 ) >> 6;
    i_res1 = ( i_res1 + 32 ) >> 6;

    i_res3 = 7 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
    i_res2 -= i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res2 );
    vec5 = __msa_fill_w( i_res1 );
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;

    for( u_lpcnt = 16; u_lpcnt--; )
    {
        vec0 = vec7;
        vec0 += vec4;
        vec1 = vec0 + vec6;
        vec2 = vec1 + vec6;
        vec3 = vec2 + vec6;

        SRA_4V( vec0, vec1, vec2, vec3, 5 );
        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
        CLIP_SH2_0_255( vec9, vec10 );
        PCKEV_ST_SB( vec9, vec10, p_src );
        p_src += i_stride;

        vec4 += vec5;
    }
}

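/* DC prediction of the four 4x4 sub-blocks of an 8x8 block, in place: the
 * top-left and bottom-right sub-blocks average both neighbours, the
 * top-right uses only the row above and the bottom-left only the left
 * column. */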
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lp_cnt;
    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 p_src_top;
    v8u16 add;
    v4u32 sum;

    p_src_top = LD_UB( p_src - i_stride );
    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
    sum = __msa_hadd_u_w( add, add );
    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );

    for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
    {
        u_src0 += p_src[u_lp_cnt * i_stride - 1];
        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
    }

    u_src0 = ( u_src0 + 4 ) >> 3;
    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
    u_src1 = ( u_src1 + 2 ) >> 2;
    u_src2 = ( u_src2 + 2 ) >> 2;

    u_out0 = u_src0 * 0x01010101;
    u_out1 = u_src1 * 0x01010101;
    u_out2 = u_src2 * 0x01010101;
    u_out3 = u_src3 * 0x01010101;

    for( u_lp_cnt = 4; u_lp_cnt--; )
    {
        SW( u_out0, p_src );
        SW( u_out1, ( p_src + 4 ) );
        SW( u_out2, ( p_src + 4 * i_stride ) );
        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
        p_src += i_stride;
    }
}

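/* Diagonal-down-left 8x8 prediction: p_src holds the 16 top neighbours;
 * they are lowpass filtered ((a + 2*b + c + 2) >> 2) and each output row
 * is the filtered diagonal shifted one pixel further to the left. */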
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                       int32_t i_dst_stride )
{
    uint8_t u_src_val = p_src[15];
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src, vec4, vec5, res0;
    v8u16 vec0, vec1, vec2, vec3;
    v2i64 res1, res2, res3;

    src = LD_UB( p_src );

    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );

    vec0 += vec1;
    vec2 += vec3;
    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );

    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );

    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

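/* DC 16x16 prediction with no neighbours available: fill the block with
 * the constant 128. */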
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );

    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
}

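/* Exported wrappers: adapt x264's fixed FDEC_STRIDE reconstruction layout
 * (and, for the 8x8 luma modes, the pu_xyz neighbour array filled by the
 * caller) to the prediction kernels above. */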
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
}

void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
}

void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
{
    intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
{
    intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
                                   p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
{
    intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
{
    intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
{
    intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
{
    intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
{
    intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
{
    intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
                              p_src, FDEC_STRIDE );
}

void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
}

void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
{
    intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                              FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
{
    intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
{
    intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}
#endif