/*****************************************************************************
 * predict-c.c: msa intra prediction
 *****************************************************************************
 * Copyright (C) 2015-2021 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <mandar.sahastrabuddhe@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH
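/* Vertical 4x4: load the 4 pixels above the block once and copy them
 * unchanged into each of the 4 rows. */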
static void intra_predict_vert_4x4_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint32_t u_src_data;

    u_src_data = LW( p_src );

    SW4( u_src_data, u_src_data, u_src_data, u_src_data, p_dst, i_dst_stride );
}

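/* Vertical 8x8: replicate the 8-byte row of top neighbours into all 8 rows. */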
static void intra_predict_vert_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                        int32_t i_dst_stride )
{
    uint64_t u_out;

    u_out = LD( p_src );

    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

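/* Vertical 16x16: replicate the 16-byte vector of top neighbours into all
 * 16 rows. */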
static void intra_predict_vert_16x16_msa( uint8_t *p_src, uint8_t *p_dst,
                                          int32_t i_dst_stride )
{
    v16u8 src0 = LD_UB( p_src );

    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( src0, src0, src0, src0, src0, src0, src0, src0, p_dst,
            i_dst_stride );
}

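/* Horizontal 4x4: splat each left-neighbour pixel across its row.
 * Multiplying a byte by 0x01010101 broadcasts it into all 4 bytes of a
 * 32-bit word. */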
static void intra_predict_horiz_4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint32_t u_out0, u_out1, u_out2, u_out3;

    u_out0 = p_src[0 * i_src_stride] * 0x01010101;
    u_out1 = p_src[1 * i_src_stride] * 0x01010101;
    u_out2 = p_src[2 * i_src_stride] * 0x01010101;
    u_out3 = p_src[3 * i_src_stride] * 0x01010101;

    SW4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

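/* Horizontal 8x8: same broadcast trick as the 4x4 case, with a 64-bit
 * multiplier filling 8 bytes per row. */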
static void intra_predict_horiz_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                                         uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;

    u_out0 = p_src[0 * i_src_stride] * 0x0101010101010101ull;
    u_out1 = p_src[1 * i_src_stride] * 0x0101010101010101ull;
    u_out2 = p_src[2 * i_src_stride] * 0x0101010101010101ull;
    u_out3 = p_src[3 * i_src_stride] * 0x0101010101010101ull;
    u_out4 = p_src[4 * i_src_stride] * 0x0101010101010101ull;
    u_out5 = p_src[5 * i_src_stride] * 0x0101010101010101ull;
    u_out6 = p_src[6 * i_src_stride] * 0x0101010101010101ull;
    u_out7 = p_src[7 * i_src_stride] * 0x0101010101010101ull;

    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
}

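/* Horizontal 16x16: broadcast each left-neighbour pixel into a full vector
 * with fill_b and store 4 rows per loop iteration. */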
static void intra_predict_horiz_16x16_msa( uint8_t *p_src, int32_t i_src_stride,
                                           uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    uint32_t u_row;
    uint8_t u_inp0, u_inp1, u_inp2, u_inp3;
    v16u8 src0, src1, src2, src3;

    for( u_row = 4; u_row--; )
    {
        u_inp0 = p_src[0];
        p_src += i_src_stride;
        u_inp1 = p_src[0];
        p_src += i_src_stride;
        u_inp2 = p_src[0];
        p_src += i_src_stride;
        u_inp3 = p_src[0];
        p_src += i_src_stride;

        src0 = ( v16u8 ) __msa_fill_b( u_inp0 );
        src1 = ( v16u8 ) __msa_fill_b( u_inp1 );
        src2 = ( v16u8 ) __msa_fill_b( u_inp2 );
        src3 = ( v16u8 ) __msa_fill_b( u_inp3 );

        ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }
}

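/* DC 4x4: fill the block with the rounded average of the available
 * neighbours: (top + left + 4) >> 3 when both edges exist, (sum + 2) >> 2
 * when only one edge exists, or the constant 128 when neither does. */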
static void intra_predict_dc_4x4_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      int32_t i_src_stride_left,
                                      uint8_t *p_dst, int32_t i_dst_stride,
                                      uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_out, u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum;

    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 4 ) >> 3;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 4; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 2 ) >> 2;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum = __msa_hadd_u_w( sum_above, sum_above );
        sum = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum, 2 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    u_out = __msa_copy_u_w( ( v4i32 ) store, 0 );

    SW4( u_out, u_out, u_out, u_out, p_dst, i_dst_stride );
}

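/* DC 8x8 (luma 8x8 with filtered edges): pack the 8 top and 8 left
 * neighbours into one vector, reduce with horizontal adds, and fill the
 * block with (sum + 8) >> 4. */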
static void intra_predict_dc_8x8_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                      uint8_t *p_dst, int32_t i_dst_stride )
{
    uint64_t u_val0, u_val1;
    v16i8 store;
    v16u8 src = { 0 };
    v8u16 sum_h;
    v4u32 sum_w;
    v2u64 sum_d;

    u_val0 = LD( p_src_top );
    u_val1 = LD( p_src_left );
    INSERT_D2_UB( u_val0, u_val1, src );
    sum_h = __msa_hadd_u_h( src, src );
    sum_w = __msa_hadd_u_w( sum_h, sum_h );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum_d, ( v4i32 ) sum_d );
    sum_d = __msa_hadd_u_d( sum_w, sum_w );
    sum_w = ( v4u32 ) __msa_srari_w( ( v4i32 ) sum_d, 4 );
    store = __msa_splati_b( ( v16i8 ) sum_w, 0 );
    u_val0 = __msa_copy_u_d( ( v2i64 ) store, 0 );

    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    SD4( u_val0, u_val0, u_val0, u_val0, p_dst, i_dst_stride );
}

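/* DC 16x16: same availability rules as the 4x4 case, scaled up:
 * (top + left + 16) >> 5 for both edges, (sum + 8) >> 4 for a single edge,
 * or the constant 128. */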
static void intra_predict_dc_16x16_msa( uint8_t *p_src_top, uint8_t *p_src_left,
                                        int32_t i_src_stride_left,
                                        uint8_t *p_dst, int32_t i_dst_stride,
                                        uint8_t is_above, uint8_t is_left )
{
    uint32_t u_row;
    uint32_t u_addition = 0;
    v16u8 src_above, store;
    v8u16 sum_above;
    v4u32 sum_top;
    v2u64 sum;

    if( is_left && is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        u_addition = __msa_copy_u_w( ( v4i32 ) sum, 0 );

        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 16 ) >> 5;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_left )
    {
        for( u_row = 0; u_row < 16; u_row++ )
        {
            u_addition += p_src_left[u_row * i_src_stride_left];
        }

        u_addition = ( u_addition + 8 ) >> 4;
        store = ( v16u8 ) __msa_fill_b( u_addition );
    }
    else if( is_above )
    {
        src_above = LD_UB( p_src_top );

        sum_above = __msa_hadd_u_h( src_above, src_above );
        sum_top = __msa_hadd_u_w( sum_above, sum_above );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum_top = ( v4u32 ) __msa_pckev_w( ( v4i32 ) sum, ( v4i32 ) sum );
        sum = __msa_hadd_u_d( sum_top, sum_top );
        sum = ( v2u64 ) __msa_srari_d( ( v2i64 ) sum, 4 );
        store = ( v16u8 ) __msa_splati_b( ( v16i8 ) sum, 0 );
    }
    else
    {
        store = ( v16u8 ) __msa_ldi_b( 128 );
    }

    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( store, store, store, store, store, store, store, store, p_dst,
            i_dst_stride );
}

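/* Plane 8x8 (chroma): compute the horizontal gradient H from the top row
 * with a shuffle/multiply reduction (i_res0) and the vertical gradient V
 * from the left column (i_res1), derive b = (17*H + 16) >> 5 and
 * c = (17*V + 16) >> 5 plus the DC offset, then evaluate the plane two
 * rows per iteration with clipping to [0,255]. */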
static void intra_predict_plane_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res, i_res0, i_res1, i_res2, i_res3;
    uint64_t u_out0, u_out1;
    v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top;
    v8i16 vec9, vec10, vec11;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
    v2i64 sum;

    p_src_top = LD_UB( p_src - ( i_stride + 1 ) );
    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    sum = __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( ( v4i32 ) sum, 0 );

    i_res1 = ( p_src[4 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             2 * ( p_src[5 * i_stride - 1] - p_src[i_stride - 1] ) +
             3 * ( p_src[6 * i_stride - 1] - p_src[-1] ) +
             4 * ( p_src[7 * i_stride - 1] - p_src[-i_stride - 1] );

    i_res0 *= 17;
    i_res1 *= 17;
    i_res0 = ( i_res0 + 16 ) >> 5;
    i_res1 = ( i_res1 + 16 ) >> 5;

    i_res3 = 3 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[7 * i_stride - 1] + p_src[-i_stride + 7] + 1 );
    i_res = i_res2 - i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res );
    vec2 = __msa_fill_w( i_res1 );
    vec5 = vec8 * int_multiplier;
    vec3 = vec8 * 4;

    for( u_lpcnt = 4; u_lpcnt--; )
    {
        vec0 = vec5;
        vec0 += vec4;
        vec1 = vec0 + vec3;
        vec6 = vec5;
        vec4 += vec2;
        vec6 += vec4;
        vec7 = vec6 + vec3;

        SRA_4V( vec0, vec1, vec6, vec7, 5 );
        PCKEV_H2_SH( vec1, vec0, vec7, vec6, vec10, vec11 );
        CLIP_SH2_0_255( vec10, vec11 );
        PCKEV_B2_SH( vec10, vec10, vec11, vec11, vec10, vec11 );

        u_out0 = __msa_copy_s_d( ( v2i64 ) vec10, 0 );
        u_out1 = __msa_copy_s_d( ( v2i64 ) vec11, 0 );
        SD( u_out0, p_src );
        p_src += i_stride;
        SD( u_out1, p_src );
        p_src += i_stride;

        vec4 += vec2;
    }
}

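/* Plane 16x16 (luma): same scheme as the 8x8 case with the luma scaling
 * b = (5*H + 32) >> 6 and c = (5*V + 32) >> 6, writing one full 16-pixel
 * row per iteration. */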
static void intra_predict_plane_16x16_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lpcnt;
    int32_t i_res0, i_res1, i_res2, i_res3;
    uint64_t u_load0, u_load1;
    v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
    v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
    v4i32 int_multiplier = { 0, 1, 2, 3 };
    v16u8 p_src_top = { 0 };
    v8i16 vec9, vec10;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;

    u_load0 = LD( p_src - ( i_stride + 1 ) );
    u_load1 = LD( p_src - ( i_stride + 1 ) + 9 );

    INSERT_D2_UB( u_load0, u_load1, p_src_top );

    p_src_top = ( v16u8 ) __msa_vshf_b( shf_mask, ( v16i8 ) p_src_top,
                                        ( v16i8 ) p_src_top );

    vec9 = __msa_hsub_u_h( p_src_top, p_src_top );
    vec9 *= short_multiplier;
    vec8 = __msa_hadd_s_w( vec9, vec9 );
    res_add = ( v4i32 ) __msa_hadd_s_d( vec8, vec8 );

    i_res0 = __msa_copy_s_w( res_add, 0 ) + __msa_copy_s_w( res_add, 2 );

    i_res1 = ( p_src[8 * i_stride - 1] - p_src[6 * i_stride - 1] ) +
             2 * ( p_src[9 * i_stride - 1] - p_src[5 * i_stride - 1] ) +
             3 * ( p_src[10 * i_stride - 1] - p_src[4 * i_stride - 1] ) +
             4 * ( p_src[11 * i_stride - 1] - p_src[3 * i_stride - 1] ) +
             5 * ( p_src[12 * i_stride - 1] - p_src[2 * i_stride - 1] ) +
             6 * ( p_src[13 * i_stride - 1] - p_src[i_stride - 1] ) +
             7 * ( p_src[14 * i_stride - 1] - p_src[-1] ) +
             8 * ( p_src[15 * i_stride - 1] - p_src[-1 * i_stride - 1] );

    i_res0 *= 5;
    i_res1 *= 5;
    i_res0 = ( i_res0 + 32 ) >> 6;
    i_res1 = ( i_res1 + 32 ) >> 6;

    i_res3 = 7 * ( i_res0 + i_res1 );
    i_res2 = 16 * ( p_src[15 * i_stride - 1] + p_src[-i_stride + 15] + 1 );
    i_res2 -= i_res3;

    vec8 = __msa_fill_w( i_res0 );
    vec4 = __msa_fill_w( i_res2 );
    vec5 = __msa_fill_w( i_res1 );
    vec6 = vec8 * 4;
    vec7 = vec8 * int_multiplier;

    for( u_lpcnt = 16; u_lpcnt--; )
    {
        vec0 = vec7;
        vec0 += vec4;
        vec1 = vec0 + vec6;
        vec2 = vec1 + vec6;
        vec3 = vec2 + vec6;

        SRA_4V( vec0, vec1, vec2, vec3, 5 );
        PCKEV_H2_SH( vec1, vec0, vec3, vec2, vec9, vec10 );
        CLIP_SH2_0_255( vec9, vec10 );
        PCKEV_ST_SB( vec9, vec10, p_src );
        p_src += i_stride;

        vec4 += vec5;
    }
}

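/* DC for the four 4x4 sub-blocks of an 8x8 chroma block, following the
 * H.264 chroma DC rules: the top-left and bottom-right quadrants average
 * both of their edges, while the top-right and bottom-left quadrants use
 * only their top or left edge respectively. */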
static void intra_predict_dc_4blk_8x8_msa( uint8_t *p_src, int32_t i_stride )
{
    uint8_t u_lp_cnt;
    uint32_t u_src0, u_src1, u_src3, u_src2 = 0;
    uint32_t u_out0, u_out1, u_out2, u_out3;
    v16u8 p_src_top;
    v8u16 add;
    v4u32 sum;

    p_src_top = LD_UB( p_src - i_stride );
    add = __msa_hadd_u_h( ( v16u8 ) p_src_top, ( v16u8 ) p_src_top );
    sum = __msa_hadd_u_w( add, add );
    u_src0 = __msa_copy_u_w( ( v4i32 ) sum, 0 );
    u_src1 = __msa_copy_u_w( ( v4i32 ) sum, 1 );

    for( u_lp_cnt = 0; u_lp_cnt < 4; u_lp_cnt++ )
    {
        u_src0 += p_src[u_lp_cnt * i_stride - 1];
        u_src2 += p_src[( 4 + u_lp_cnt ) * i_stride - 1];
    }

    u_src0 = ( u_src0 + 4 ) >> 3;
    u_src3 = ( u_src1 + u_src2 + 4 ) >> 3;
    u_src1 = ( u_src1 + 2 ) >> 2;
    u_src2 = ( u_src2 + 2 ) >> 2;

    u_out0 = u_src0 * 0x01010101;
    u_out1 = u_src1 * 0x01010101;
    u_out2 = u_src2 * 0x01010101;
    u_out3 = u_src3 * 0x01010101;

    for( u_lp_cnt = 4; u_lp_cnt--; )
    {
        SW( u_out0, p_src );
        SW( u_out1, ( p_src + 4 ) );
        SW( u_out2, ( p_src + 4 * i_stride ) );
        SW( u_out3, ( p_src + 4 * i_stride + 4 ) );
        p_src += i_stride;
    }
}

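/* Diagonal down-left 8x8: apply the ( a + 2*b + c + 2 ) >> 2 three-tap
 * filter to the 16 top neighbours (the last pixel is repeated via the
 * insert of p_src[15]), then emit each row as a successively shifted view
 * of the filtered vector. */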
static void intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t *p_dst,
                                       int32_t i_dst_stride )
{
    uint8_t u_src_val = p_src[15];
    uint64_t u_out0, u_out1, u_out2, u_out3;
    v16u8 src, vec4, vec5, res0;
    v8u16 vec0, vec1, vec2, vec3;
    v2i64 res1, res2, res3;

    src = LD_UB( p_src );

    vec4 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 1 );
    vec5 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) src, ( v16i8 ) src, 2 );
    vec5 = ( v16u8 ) __msa_insert_b( ( v16i8 ) vec5, 14, u_src_val );
    ILVR_B2_UH( vec5, src, vec4, vec4, vec0, vec1 );
    ILVL_B2_UH( vec5, src, vec4, vec4, vec2, vec3 );
    HADD_UB4_UH( vec0, vec1, vec2, vec3, vec0, vec1, vec2, vec3 );

    vec0 += vec1;
    vec2 += vec3;
    vec0 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec0, 2 );
    vec2 = ( v8u16 ) __msa_srari_h( ( v8i16 ) vec2, 2 );

    res0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec2, ( v16i8 ) vec0 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );

    res0 = ( v16u8 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 4 );
    res1 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 1 );
    res2 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 2 );
    res3 = ( v2i64 ) __msa_sldi_b( ( v16i8 ) res0, ( v16i8 ) res0, 3 );

    u_out0 = __msa_copy_u_d( ( v2i64 ) res0, 0 );
    u_out1 = __msa_copy_u_d( res1, 0 );
    u_out2 = __msa_copy_u_d( res2, 0 );
    u_out3 = __msa_copy_u_d( res3, 0 );
    SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
}

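/* DC 128: fill the 16x16 block with the mid-value 128, used when no
 * neighbours are available. */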
static void intra_predict_128dc_16x16_msa( uint8_t *p_dst,
                                           int32_t i_dst_stride )
{
    v16u8 out = ( v16u8 ) __msa_ldi_b( 128 );

    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
    p_dst += ( 8 * i_dst_stride );
    ST_UB8( out, out, out, out, out, out, out, out, p_dst, i_dst_stride );
}

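/* x264 entry points: thin wrappers that pass the fixed FDEC_STRIDE
 * reconstruction stride and, for the 8x8 luma modes, offsets into the
 * filtered edge array pu_xyz (top pixels at pu_xyz + 16, left pixels
 * ending at pu_xyz + 14 and walked with stride -1). */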
void x264_intra_predict_dc_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_dc_left_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 0, 1 );
}

void x264_intra_predict_dc_top_16x16_msa( uint8_t *p_src )
{
    intra_predict_dc_16x16_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                                FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 0 );
}

void x264_intra_predict_dc_128_16x16_msa( uint8_t *p_src )
{
    intra_predict_128dc_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_16x16_msa( uint8_t *p_src )
{
    intra_predict_horiz_16x16_msa( ( p_src - 1 ), FDEC_STRIDE,
                                   p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_16x16_msa( uint8_t *p_src )
{
    intra_predict_vert_16x16_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_16x16_msa( uint8_t *p_src )
{
    intra_predict_plane_16x16_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4blk_8x8_msa( uint8_t *p_src )
{
    intra_predict_dc_4blk_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_hor_8x8_msa( uint8_t *p_src )
{
    intra_predict_horiz_8x8_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_8x8_msa( uint8_t *p_src )
{
    intra_predict_vert_8x8_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_plane_8x8_msa( uint8_t *p_src )
{
    intra_predict_plane_8x8_msa( p_src, FDEC_STRIDE );
}

void x264_intra_predict_ddl_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_ddl_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_dc_8x8_msa( ( pu_xyz + 16 ), ( pu_xyz + 7 ),
                              p_src, FDEC_STRIDE );
}

void x264_intra_predict_h_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_horiz_8x8_msa( ( pu_xyz + 14 ), -1, p_src, FDEC_STRIDE );
}

void x264_intra_predict_v_8x8_msa( uint8_t *p_src, uint8_t pu_xyz[36] )
{
    intra_predict_vert_8x8_msa( ( pu_xyz + 16 ), p_src, FDEC_STRIDE );
}

void x264_intra_predict_dc_4x4_msa( uint8_t *p_src )
{
    intra_predict_dc_4x4_msa( ( p_src - FDEC_STRIDE ), ( p_src - 1 ),
                              FDEC_STRIDE, p_src, FDEC_STRIDE, 1, 1 );
}

void x264_intra_predict_hor_4x4_msa( uint8_t *p_src )
{
    intra_predict_horiz_4x4_msa( ( p_src - 1 ), FDEC_STRIDE,
                                 p_src, FDEC_STRIDE );
}

void x264_intra_predict_vert_4x4_msa( uint8_t *p_src )
{
    intra_predict_vert_4x4_msa( ( p_src - FDEC_STRIDE ), p_src, FDEC_STRIDE );
}
#endif