/*****************************************************************************
 * dct-c.c: msa transform and zigzag
 *****************************************************************************
 * Copyright (C) 2015-2021 x264 project
 *
 * Authors: Rishikesh More <rishikesh.more@imgtec.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "dct.h"

#if !HIGH_BIT_DEPTH
/* One 4-point AVC inverse-transform stage on v8i16 vectors.
 * Builds the four intermediates
 *   tmp0 = in0 + in2, tmp1 = in0 - in2,
 *   tmp2 = (in1 >> 1) - in3, tmp3 = in1 + (in3 >> 1)
 * and butterflies them into out0..out3 (H.264 4x4 inverse DCT kernel). */
#define AVC_ITRANS_H( in0, in1, in2, in3, out0, out1, out2, out3 )          \
{                                                                           \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = in0 + in2;                                                     \
    tmp1_m = in0 - in2;                                                     \
    tmp2_m = in1 >> 1;                                                      \
    tmp2_m = tmp2_m - in3;                                                  \
    tmp3_m = in3 >> 1;                                                      \
    tmp3_m = in1 + tmp3_m;                                                  \
                                                                            \
    BUTTERFLY_4( tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3 );  \
}

/* Forward transform of a 4x4 block of DC coefficients.
 *
 * p_src        - input 4x4 int16 block (row stride i_src_stride elements)
 * p_dst        - output, 16 coefficients stored contiguously
 * i_src_stride - row stride of p_src, in elements
 *
 * Rows are transformed with two add/sub butterfly passes, the tile is
 * transposed, columns get the same two passes, and the result is rounded
 * with (x + 1) >> 1 before narrowing back to 16 bits. */
static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
                              int32_t i_src_stride )
{
    v8i16 src0, src1, src2, src3, ver_res0, ver_res1, ver_res2, ver_res3;
    v4i32 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
    v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
    v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    /* Widen the low four halfwords of each row to 32-bit lanes. */
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    /* Horizontal (row) pass: two butterfly stages. */
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 hor_res0, hor_res3, hor_res2, hor_res1 );
    TRANSPOSE4x4_SW_SW( hor_res0, hor_res1, hor_res2, hor_res3,
                        hor_res0, hor_res1, hor_res2, hor_res3 );
    /* Vertical (column) pass on the transposed data. */
    BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
                 tmp0, tmp3, tmp2, tmp1 );
    BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
                 ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
    /* Rounding shift: (x + 1) >> 1. */
    SRARI_W4_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
    /* Narrow to 16 bits, then pack the four rows into two stores. */
    PCKEV_H4_SH( ver_res0_r, ver_res0_r, ver_res1_r, ver_res1_r,
                 ver_res2_r, ver_res2_r, ver_res3_r, ver_res3_r,
                 ver_res0, ver_res1, ver_res2, ver_res3 );
    PCKOD_D2_SH( ver_res1, ver_res0, ver_res3, ver_res2, ver_res0, ver_res2 );
    ST_SH2( ver_res0, ver_res2, p_dst, 8 );
}

/* 4x4 forward DCT of the residual (p_src - p_ref).
 *
 * p_src        - source (encode) pixels, stride i_src_stride
 * p_ref        - reference (predicted) pixels, stride i_dst_stride
 * p_dst        - output, 16 transform coefficients stored contiguously
 *
 * The residual is computed with byte-level horizontal subtraction, then
 * the standard AVC forward kernel (butterfly + {<<1} weighting) is applied
 * to rows, the block is transposed, and the same kernel is applied again. */
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_dst_stride,
                                int16_t *p_dst )
{
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_ref0, i_ref1, i_ref2, i_ref3;
    v16i8 src = { 0 };
    v16i8 ref = { 0 };
    v16u8 inp0, inp1;
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    /* Gather the four 4-byte rows of source and reference. */
    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 );

    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref );

    /* Interleave src/ref bytes so HSUB produces per-pixel differences. */
    ILVRL_B2_UB( src, ref, inp0, inp1 );

    HSUB_UB2_SH( inp0, inp1, diff0, diff2 );

    /* Move rows 1 and 3 (upper halves) into their own vectors. */
    diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );

    /* Horizontal pass. */
    BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );

    diff0 = temp0 + temp1;
    diff1 = ( temp3 << 1 ) + temp2;
    diff2 = temp0 - temp1;
    diff3 = temp3 - ( temp2 << 1 );

    TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                        temp0, temp1, temp2, temp3 );
    /* Vertical pass, same kernel. */
    BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );

    temp0 = diff0 + diff1;
    temp1 = ( diff3 << 1 ) + diff2;
    temp2 = diff0 - diff1;
    temp3 = diff3 - ( diff2 << 1 );

    ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 );
    ST_UB2( inp0, inp1, p_dst, 8 );
}

/* Zigzag (frame) scan reorder of 16 coefficients: pi_level[i] = pi_dct[mask[i]].
 * Note: VSHF_H2_SH writes its shuffled results back into mask0/mask1,
 * which is why the masks themselves are stored to pi_level afterwards. */
static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
                                           int16_t pi_level[16] )
{
    v8i16 src0, src1;
    /* Source indices of the first/second eight output positions. */
    v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
    v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };

    LD_SH2( pi_dct, 8, src0, src1 );
    VSHF_H2_SH( src0, src1, src0, src1, mask0, mask1, mask0, mask1 );
    ST_SH2( mask0, mask1, pi_level, 8 );
}

/* 4x4 inverse transform of p_src, added into the prediction at p_dst.
 *
 * Applies AVC_ITRANS_H to rows, transposes, applies it to columns,
 * rounds with (x + 32) >> 6, then adds to and clips the destination
 * pixels.  The coefficient block at p_src is zeroed afterwards. */
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                    int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v8i16 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v8i16 zeros = { 0 };

    LD4x4_SH( p_src, src0, src1, src2, src3 );
    AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 );
    TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 );
    /* Final rounding: (x + 32) >> 6. */
    SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 );
    ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride );
    /* Clear the consumed coefficients for the next block. */
    ST_SH2( zeros, zeros, p_src, 8 );
}

/* DC-only 4x4 inverse transform: add the rounded DC value
 * (p_src[0] + 32) >> 6 to every pixel of the 4x4 destination block,
 * clipping to [0, 255].  p_src[0] is cleared after being consumed. */
static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
                                       int32_t i_dst_stride )
{
    int16_t i_dc;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    v16u8 pred = { 0 };
    v16i8 out;
    v8i16 input_dc, pred_r, pred_l;

    i_dc = ( p_src[0] + 32 ) >> 6;
    /* Broadcast the DC value across all lanes. */
    input_dc = __msa_fill_h( i_dc );
    p_src[ 0 ] = 0;

    LW4( p_dst, i_dst_stride, i_src0, i_src1, i_src2, i_src3 );
    INSERT_W4_UB( i_src0, i_src1, i_src2, i_src3, pred );
    /* Widen the prediction bytes to 16 bits so the add cannot wrap. */
    UNPCK_UB_SH( pred, pred_r, pred_l );

    pred_r += input_dc;
    pred_l += input_dc;

    CLIP_SH2_0_255( pred_r, pred_l );
    out = __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
    ST4x4_UB( out, out, 0, 1, 2, 3, p_dst, i_dst_stride );
}

/* 8x8 inverse transform of p_src, added into the prediction at p_dst.
 *
 * The horizontal pass runs in 16-bit precision; after the transpose the
 * vertical pass is done in 32-bit precision (rows unpacked into _r/_l
 * halves).  Rounding is folded in up front by adding 32 to the DC term,
 * so the final shift is a plain >> 6.  Results are added to the
 * destination pixels and clipped to [0, 255].
 * NOTE(review): unlike the 4x4 path, the coefficient block is not
 * zeroed here — presumably the caller handles that; confirm. */
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                  int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
    v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
    v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
    v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
    v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
    v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zeros = { 0 };

    /* Pre-bias DC so the final >> 6 rounds to nearest. */
    p_src[ 0 ] += 32;

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    /* Horizontal pass, even part (16-bit). */
    vec0 = src0 + src4;
    vec1 = src0 - src4;
    vec2 = src2 >> 1;
    vec2 = vec2 - src6;
    vec3 = src6 >> 1;
    vec3 = src2 + vec3;

    BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );

    /* Horizontal pass, odd part. */
    vec0 = src7 >> 1;
    vec0 = src5 - vec0 - src3 - src7;
    vec1 = src3 >> 1;
    vec1 = src1 - vec1 + src7 - src3;
    vec2 = src5 >> 1;
    vec2 = vec2 - src1 + src7 + src5;
    vec3 = src1 >> 1;
    vec3 = vec3 + src3 + src5 + src1;
    tmp4 = vec3 >> 2;
    tmp4 += vec0;
    tmp5 = vec2 >> 2;
    tmp5 += vec1;
    tmp6 = vec1 >> 2;
    tmp6 -= vec2;
    tmp7 = vec0 >> 2;
    tmp7 = vec3 - tmp7;

    BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
                 res0, res1, res2, res3, res4, res5, res6, res7 );
    TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7,
                        res0, res1, res2, res3, res4, res5, res6, res7 );
    /* Widen each row to two v4i32 halves for the vertical pass. */
    UNPCK_SH_SW( res0, tmp0_r, tmp0_l );
    UNPCK_SH_SW( res1, tmp1_r, tmp1_l );
    UNPCK_SH_SW( res2, tmp2_r, tmp2_l );
    UNPCK_SH_SW( res3, tmp3_r, tmp3_l );
    UNPCK_SH_SW( res4, tmp4_r, tmp4_l );
    UNPCK_SH_SW( res5, tmp5_r, tmp5_l );
    UNPCK_SH_SW( res6, tmp6_r, tmp6_l );
    UNPCK_SH_SW( res7, tmp7_r, tmp7_l );
    /* Vertical pass, even part (32-bit). */
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
                 vec0_r, vec0_l, vec1_l, vec1_r );

    vec2_r = tmp2_r >> 1;
    vec2_l = tmp2_l >> 1;
    vec2_r -= tmp6_r;
    vec2_l -= tmp6_l;
    vec3_r = tmp6_r >> 1;
    vec3_l = tmp6_l >> 1;
    vec3_r += tmp2_r;
    vec3_l += tmp2_l;

    BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
                 tmp0_r, tmp2_r, tmp4_r, tmp6_r );
    BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
                 tmp0_l, tmp2_l, tmp4_l, tmp6_l );

    /* Vertical pass, odd part. */
    vec0_r = tmp7_r >> 1;
    vec0_l = tmp7_l >> 1;
    vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
    vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
    vec1_r = tmp3_r >> 1;
    vec1_l = tmp3_l >> 1;
    vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
    vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
    vec2_r = tmp5_r >> 1;
    vec2_l = tmp5_l >> 1;
    vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
    vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
    vec3_r = tmp1_r >> 1;
    vec3_l = tmp1_l >> 1;
    vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
    vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
    tmp1_r = vec3_r >> 2;
    tmp1_l = vec3_l >> 2;
    tmp1_r += vec0_r;
    tmp1_l += vec0_l;
    tmp3_r = vec2_r >> 2;
    tmp3_l = vec2_l >> 2;
    tmp3_r += vec1_r;
    tmp3_l += vec1_l;
    tmp5_r = vec1_r >> 2;
    tmp5_l = vec1_l >> 2;
    tmp5_r -= vec2_r;
    tmp5_l -= vec2_l;
    tmp7_r = vec0_r >> 2;
    tmp7_l = vec0_l >> 2;
    tmp7_r = vec3_r - tmp7_r;
    tmp7_l = vec3_l - tmp7_l;

    /* Final even/odd recombination into output rows. */
    BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r,
                 res0_r, res0_l, res7_l, res7_r );
    BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r,
                 res1_r, res1_l, res6_l, res6_r );
    BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r,
                 res2_r, res2_l, res5_l, res5_r );
    BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r,
                 res3_r, res3_l, res4_l, res4_r );
    /* Descale (DC was pre-biased by 32 above). */
    SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 );
    SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 );
    SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 );
    SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 );
    PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
                 res0, res1, res2, res3 );
    PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
                 res4, res5, res6, res7 );
    /* Add to the prediction and clip to pixel range. */
    LD_SB8( p_dst, i_dst_stride,
            dst0, dst1, dst2, dst3,
            dst4, dst5, dst6, dst7 );
    ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
                tmp0, tmp1, tmp2, tmp3 );
    ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
                tmp4, tmp5, tmp6, tmp7 );
    ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
          res0, res1, res2, res3 );
    ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
          res4, res5, res6, res7 );
    CLIP_SH4_0_255( res0, res1, res2, res3 );
    CLIP_SH4_0_255( res4, res5, res6, res7 );
    PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                 dst0, dst1, dst2, dst3 );
    ST8x4_UB( dst0, dst1, p_dst, i_dst_stride );
    p_dst += ( 4 * i_dst_stride );
    ST8x4_UB( dst2, dst3, p_dst, i_dst_stride );
}

/* Inverse transform of a 4x4 block of DC coefficients (no rounding shift,
 * unlike the forward avc_dct4x4dc_msa): row butterflies, transpose,
 * column butterflies, then narrow and store. */
static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
                               int16_t *p_dst, int32_t i_dst_stride )
{
    v8i16 src0, src1, src2, src3;
    v4i32 src0_r, src1_r, src2_r, src3_r;
    v4i32 hres0, hres1, hres2, hres3;
    v8i16 vres0, vres1, vres2, vres3;
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v2i64 res0, res1;

    LD_SH4( p_src, i_src_stride, src0, src1, src2, src3 );
    /* Widen the low four halfwords of each row to 32-bit lanes. */
    UNPCK_R_SH_SW( src0, src0_r );
    UNPCK_R_SH_SW( src1, src1_r );
    UNPCK_R_SH_SW( src2, src2_r );
    UNPCK_R_SH_SW( src3, src3_r );
    /* Horizontal pass. */
    BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1 );
    TRANSPOSE4x4_SW_SW( hres0, hres1, hres2, hres3,
                        hres0, hres1, hres2, hres3 );
    /* Vertical pass. */
    BUTTERFLY_4( hres0, hres2, hres3, hres1, vec0, vec3, vec2, vec1 );
    BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
    /* Narrow to 16 bits and pack row pairs for the strided store. */
    PCKEV_H4_SH( vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                 vres0, vres1, vres2, vres3 );
    PCKOD_D2_SD( vres1, vres0, vres3, vres2, res0, res1 );
    ST8x4_UB( res0, res1, p_dst, i_dst_stride * 2 );
}

/* Sum of the 16 per-pixel differences (p_src - pred_ptr) over a 4x4 block.
 *
 * NOTE(review): the horizontal add treats the signed differences as
 * unsigned halfwords; truncating the 32-bit accumulator to int16_t
 * appears to rely on 16-bit wraparound to recover the signed sum
 * (which fits: |sum| <= 16 * 255) — confirm against the scalar version. */
static int32_t subtract_sum4x4_msa( uint8_t *p_src, int32_t i_src_stride,
                                    uint8_t *pred_ptr, int32_t i_pred_stride )
{
    int16_t i_sum;
    uint32_t i_src0, i_src1, i_src2, i_src3;
    uint32_t i_pred0, i_pred1, i_pred2, i_pred3;
    v16i8 src = { 0 };
    v16i8 pred = { 0 };
    v16u8 src_l0, src_l1;
    v8i16 diff0, diff1;

    LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 );
    LW4( pred_ptr, i_pred_stride, i_pred0, i_pred1, i_pred2, i_pred3 );
    INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src );
    INSERT_W4_SB( i_pred0, i_pred1, i_pred2, i_pred3, pred );
    /* Interleave src/pred bytes, then horizontally subtract per pixel. */
    ILVRL_B2_UB( src, pred, src_l0, src_l1 );
    HSUB_UB2_SH( src_l0, src_l1, diff0, diff1 );
    i_sum = HADD_UH_U32( diff0 + diff1 );

    return i_sum;
}

/* x264 entry point: in-place forward DC transform of a contiguous 4x4
 * block (stride 4 elements). */
void x264_dct4x4dc_msa( int16_t d[16] )
{
    avc_dct4x4dc_msa( d, d, 4 );
}

/* x264 entry point: in-place inverse DC transform of a contiguous 4x4
 * block (stride 4 elements). */
void x264_idct4x4dc_msa( int16_t d[16] )
{
    avc_idct4x4dc_msa( d, 4, d, 4 );
}

/* x264 entry point: 4x4 inverse transform added into the decode frame
 * buffer (also zeroes pi_dct; see avc_idct4x4_addblk_msa). */
void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

/* 8x8 inverse transform as four 4x4 sub-block transforms added into the
 * decode frame buffer.  Sub-blocks are ordered raster-wise: bit 0 of the
 * index selects the column pair, bit 1 the row pair. */
void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        uint8_t *p_sub = p_dst + ( i_blk >> 1 ) * 4 * FDEC_STRIDE
                               + ( i_blk & 1 ) * 4;
        avc_idct4x4_addblk_msa( p_sub, &pi_dct[i_blk][0], FDEC_STRIDE );
    }
}

/* 16x16 inverse transform: dispatch the four raster-ordered 8x8
 * quadrants to x264_add8x8_idct_msa. */
void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        uint8_t *p_sub = p_dst + ( i_blk >> 1 ) * 8 * FDEC_STRIDE
                               + ( i_blk & 1 ) * 8;
        x264_add8x8_idct_msa( p_sub, &pi_dct[4 * i_blk] );
    }
}

/* x264 entry point: single 8x8 inverse transform added into the decode
 * frame buffer. */
void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
{
    avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
}

/* 16x16 inverse transform using four 8x8 kernels, one per raster-ordered
 * quadrant of the destination. */
void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        uint8_t *p_sub = p_dst + ( i_blk >> 1 ) * 8 * FDEC_STRIDE
                               + ( i_blk & 1 ) * 8;
        avc_idct8_addblk_msa( p_sub, &pi_dct[i_blk][0], FDEC_STRIDE );
    }
}

/* DC-only 8x8 reconstruction: add one DC value per raster-ordered 4x4
 * sub-block of the destination. */
void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        uint8_t *p_sub = p_dst + ( i_blk >> 1 ) * 4 * FDEC_STRIDE
                               + ( i_blk & 1 ) * 4;
        avc_idct4x4_addblk_dc_msa( p_sub, &pi_dct[i_blk], FDEC_STRIDE );
    }
}

/* DC-only 16x16 reconstruction: one DC value per 4x4 sub-block, walked
 * in raster order (four rows of four columns). */
void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
{
    for( int32_t i_row = 0; i_row < 4; i_row++ )
    {
        for( int32_t i_col = 0; i_col < 4; i_col++ )
            avc_idct4x4_addblk_dc_msa( &p_dst[4 * i_col], &pi_dct[i_col],
                                       FDEC_STRIDE );
        pi_dct += 4;
        p_dst += 4 * FDEC_STRIDE;
    }
}

/* x264 entry point: 4x4 forward DCT of the residual between the encode
 * buffer (p_src, FENC_STRIDE) and the decode buffer (p_ref, FDEC_STRIDE). */
void x264_sub4x4_dct_msa( int16_t p_dst[16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, p_dst );
}

/* 8x8 forward residual DCT as four raster-ordered 4x4 sub-block DCTs. */
void x264_sub8x8_dct_msa( int16_t p_dst[4][16], uint8_t *p_src,
                          uint8_t *p_ref )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        int32_t i_x = ( i_blk & 1 ) * 4;
        int32_t i_y = ( i_blk >> 1 ) * 4;

        avc_sub4x4_dct_msa( &p_src[i_y * FENC_STRIDE + i_x], FENC_STRIDE,
                            &p_ref[i_y * FDEC_STRIDE + i_x], FDEC_STRIDE,
                            p_dst[i_blk] );
    }
}

/* 16x16 forward residual DCT: dispatch the four raster-ordered 8x8
 * quadrants to x264_sub8x8_dct_msa. */
void x264_sub16x16_dct_msa( int16_t p_dst[16][16],
                            uint8_t *p_src,
                            uint8_t *p_ref )
{
    for( int32_t i_blk = 0; i_blk < 4; i_blk++ )
    {
        int32_t i_x = ( i_blk & 1 ) * 8;
        int32_t i_y = ( i_blk >> 1 ) * 8;

        x264_sub8x8_dct_msa( &p_dst[4 * i_blk],
                             &p_src[i_y * FENC_STRIDE + i_x],
                             &p_ref[i_y * FDEC_STRIDE + i_x] );
    }
}

/* Per-4x4 DC extraction for an 8x8 chroma block: compute the four 4x4
 * residual sums, then combine them with two scalar butterfly passes
 * (a 2x2 transform of the DC terms) back into pi_dct. */
void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
                             uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t d0, d1, d2, d3;

    /* One residual sum per raster-ordered 4x4 sub-block. */
    pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
                                     &p_pix2[0], FDEC_STRIDE );
    pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
                                     &p_pix2[4], FDEC_STRIDE );
    pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 0], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 0],
                                     FDEC_STRIDE );
    pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
                                     &p_pix2[4 * FDEC_STRIDE + 4],
                                     FDEC_STRIDE );

    /* 2x2 transform of the four DC sums (note the permuted argument
     * order, which fixes the output coefficient positions). */
    BUTTERFLY_4( pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1], d0, d1, d3, d2 );
    BUTTERFLY_4( d0, d2, d3, d1, pi_dct[0], pi_dct[2], pi_dct[3], pi_dct[1] );
}

/* Per-4x4 DC extraction for an 8x16 chroma block (4:2:2): compute the
 * eight 4x4 residual sums, then combine them with three scalar
 * butterfly passes; the permuted argument orders place the transformed
 * DC terms at the coefficient positions x264 expects in pi_dct. */
void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
                              uint8_t *p_pix1, uint8_t *p_pix2 )
{
    int32_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32_t b0, b1, b2, b3, b4, b5, b6, b7;

    /* One residual sum per 4x4 sub-block, raster order (2 wide, 4 tall). */
    a0 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a1 = subtract_sum4x4_msa( &p_pix1[ 0 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 0 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a2 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a3 = subtract_sum4x4_msa( &p_pix1[ 4 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 4 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a4 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a5 = subtract_sum4x4_msa( &p_pix1[ 8 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[ 8 * FDEC_STRIDE + 4], FDEC_STRIDE );
    a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 0], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 0], FDEC_STRIDE );
    a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4], FENC_STRIDE,
                              &p_pix2[12 * FDEC_STRIDE + 4], FDEC_STRIDE );

    /* Three butterfly passes over the eight DC sums. */
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 b0, b1, b2, b3, b7, b6, b5, b4 );
    BUTTERFLY_8( b0, b2, b4, b6, b7, b5, b3, b1,
                 a0, a1, a2, a3, a7, a6, a5, a4 );
    BUTTERFLY_8( a0, a2, a4, a6, a7, a5, a3, a1,
                 pi_dct[0], pi_dct[1], pi_dct[6], pi_dct[7],
                 pi_dct[5], pi_dct[4], pi_dct[3], pi_dct[2] );
}

/* x264 entry point: frame (progressive) zigzag scan of a 4x4 coefficient
 * block, pi_dct -> pi_level. */
void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
{
    avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
}
#endif