1 /*****************************************************************************
2  * mc.c: ppc motion compensation
3  *****************************************************************************
4  * Copyright (C) 2003-2021 x264 project
5  *
6  * Authors: Eric Petit <eric.petit@lapsus.org>
7  *          Guillaume Poirier <gpoirier@mplayerhq.hu>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at licensing@x264.com.
25  *****************************************************************************/
26 
27 #include "common/common.h"
28 #include "ppccommon.h"
29 #include "mc.h"
30 
31 #if !HIGH_BIT_DEPTH
32 typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
33                          uint8_t *dst, intptr_t i_dst, int i_height );
34 
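/* pixel_avg2_wN: average two predictions with rounding, dst = ( src1 + src2 + 1 ) >> 1.
 * Both sources are read with the same stride (i_src1); used for quarter-pel positions
 * that lie between two of the precomputed half-pel planes. */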
35 static inline void pixel_avg2_w4_altivec( uint8_t *dst,  intptr_t i_dst,
36                                           uint8_t *src1, intptr_t i_src1,
37                                           uint8_t *src2, int i_height )
38 {
39     for( int y = 0; y < i_height; y++ )
40     {
41 #ifndef __POWER9_VECTOR__
42         for( int x = 0; x < 4; x++ )
43             dst[x] = ( src1[x] + src2[x] + 1 ) >> 1;
44 #else
45         vec_u8_t s1 = vec_vsx_ld( 0, src1 );
46         vec_u8_t s2 = vec_vsx_ld( 0, src2 );
47         vec_u8_t avg = vec_avg( s1, s2 );
48 
49         vec_xst_len( avg, dst, 4 );
50 #endif
51         dst  += i_dst;
52         src1 += i_src1;
53         src2 += i_src1;
54     }
55 }
56 
57 static inline void pixel_avg2_w8_altivec( uint8_t *dst,  intptr_t i_dst,
58                                           uint8_t *src1, intptr_t i_src1,
59                                           uint8_t *src2, int i_height )
60 {
61     vec_u8_t src1v, src2v;
62 
63     for( int y = 0; y < i_height; y++ )
64     {
65         src1v = vec_vsx_ld( 0, src1 );
66         src2v = vec_vsx_ld( 0, src2 );
67         src1v = vec_avg( src1v, src2v );
68 
69         VEC_STORE8(src1v, dst);
70 
71         dst  += i_dst;
72         src1 += i_src1;
73         src2 += i_src1;
74     }
75 }
76 
77 static inline void pixel_avg2_w16_altivec( uint8_t *dst,  intptr_t i_dst,
78                                            uint8_t *src1, intptr_t i_src1,
79                                            uint8_t *src2, int i_height )
80 {
81     vec_u8_t src1v, src2v;
82 
83     for( int y = 0; y < i_height; y++ )
84     {
85         src1v = vec_vsx_ld( 0, src1 );
86         src2v = vec_vsx_ld( 0, src2 );
87         src1v = vec_avg( src1v, src2v );
88         vec_st(src1v, 0, dst);
89 
90         dst  += i_dst;
91         src1 += i_src1;
92         src2 += i_src1;
93     }
94 }
95 
96 static inline void pixel_avg2_w20_altivec( uint8_t *dst,  intptr_t i_dst,
97                                            uint8_t *src1, intptr_t i_src1,
98                                            uint8_t *src2, int i_height )
99 {
100     pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
101     pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
102 }
103 
104 /* mc_copy: plain c */
105 
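/* tiny_copy: small copy of at most 16 bytes; plain memcpy before POWER9, a single
 * length-limited VSX store (vec_xst_len) on POWER9. */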
106 #ifndef __POWER9_VECTOR__
107 #define tiny_copy( d, s, l ) memcpy( d, s, l )
108 #else
109 #define tiny_copy( d, s, l ) vec_xst_len( vec_vsx_ld( 0, s ), d, l )
110 #endif
111 
112 #define MC_COPY( name, a )                                \
113 static void name( uint8_t *dst, intptr_t i_dst,           \
114                   uint8_t *src, intptr_t i_src, int i_height ) \
115 {                                                         \
116     int y;                                                \
117     for( y = 0; y < i_height; y++ )                       \
118     {                                                     \
119         memcpy( dst, src, a );                            \
120         src += i_src;                                     \
121         dst += i_dst;                                     \
122     }                                                     \
123 }
124 MC_COPY( mc_copy_w4_altivec,  4  )
125 MC_COPY( mc_copy_w8_altivec,  8  )
126 
127 static void mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
128                                  uint8_t *src, intptr_t i_src, int i_height )
129 {
130     vec_u8_t cpyV;
131 
132     for( int y = 0; y < i_height; y++ )
133     {
134         cpyV = vec_vsx_ld( 0, src );
135         vec_st(cpyV, 0, dst);
136 
137         src += i_src;
138         dst += i_dst;
139     }
140 }
141 
142 
143 static void mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst,
144                                          uint8_t *src, intptr_t i_src, int i_height )
145 {
146     for( int y = 0; y < i_height; ++y )
147     {
148         vec_u8_t cpyV = vec_ld( 0, src );
149         vec_st(cpyV, 0, dst);
150 
151         src += i_src;
152         dst += i_dst;
153     }
154 }
155 
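/* Swap the two bytes of every 16-bit pair in a row (e.g. UV <-> VU chroma ordering),
 * 16 bytes at a time.  The PLANE_COPY_SWAP(16, altivec) expansion further down wraps
 * this core into the plane_copy_swap_altivec function installed in x264_mc_init_altivec. */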
156 #define x264_plane_copy_swap_core_altivec x264_template(plane_copy_swap_core_altivec)
157 void x264_plane_copy_swap_core_altivec( uint8_t *dst, intptr_t i_dst,
158                                         uint8_t *src, intptr_t i_src, int w, int h )
159 {
160     const vec_u8_t mask = { 0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06, 0x09, 0x08, 0x0B, 0x0A, 0x0D, 0x0C, 0x0F, 0x0E };
161 
162     for( int y = 0; y < h; y++, dst += i_dst, src += i_src )
163         for( int x = 0; x < 2 * w; x += 16 )
164         {
165             vec_u8_t srcv = vec_vsx_ld( x, src );
166             vec_u8_t dstv = vec_perm( srcv, srcv, mask );
167 
168             vec_vsx_st( dstv, x, dst );
169         }
170 }
171 
172 #define x264_plane_copy_interleave_core_altivec x264_template(plane_copy_interleave_core_altivec)
173 void x264_plane_copy_interleave_core_altivec( uint8_t *dst, intptr_t i_dst,
174                                               uint8_t *srcu, intptr_t i_srcu,
175                                               uint8_t *srcv, intptr_t i_srcv, int w, int h )
176 {
177     for( int y = 0; y < h; y++, dst += i_dst, srcu += i_srcu, srcv += i_srcv )
178         for( int x = 0; x < w; x += 16 )
179         {
180             vec_u8_t srcvv = vec_vsx_ld( x, srcv );
181             vec_u8_t srcuv = vec_vsx_ld( x, srcu );
182             vec_u8_t dstv1 = vec_mergeh( srcuv, srcvv );
183             vec_u8_t dstv2 = vec_mergel( srcuv, srcvv );
184 
185             vec_vsx_st( dstv1, 2 * x, dst );
186             vec_vsx_st( dstv2, 2 * x + 16, dst );
187         }
188 }
189 
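/* Interleave one 8-pixel row of U and V from the decoded-MB buffer (FDEC strides)
 * into UVUV... order, writing 16 bytes per output row. */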
190 void x264_store_interleave_chroma_altivec( uint8_t *dst, intptr_t i_dst,
191                                            uint8_t *srcu, uint8_t *srcv, int height )
192 {
193     for( int y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE )
194     {
195         vec_u8_t srcvv = vec_vsx_ld( 0, srcv );
196         vec_u8_t srcuv = vec_vsx_ld( 0, srcu );
197         vec_u8_t dstv = vec_mergeh( srcuv, srcvv );
198 
199         vec_vsx_st(dstv, 0, dst);
200     }
201 }
202 
203 void x264_plane_copy_deinterleave_altivec( uint8_t *dstu, intptr_t i_dstu,
204                                            uint8_t *dstv, intptr_t i_dstv,
205                                            uint8_t *src, intptr_t i_src, int w, int h )
206 {
207     const vec_u8_t mask[2] = {
208         { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E },
209         { 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F }
210     };
211     for( int y = 0; y < h; y++, dstu += i_dstu, dstv += i_dstv, src += i_src )
212     {
213         for( int x = 0; x < w; x += 16 )
214         {
215             vec_u8_t srcv1 = vec_vsx_ld( 2 * x, src );
216             vec_u8_t srcv2 = vec_vsx_ld( 2 * x + 16, src );
217             vec_u8_t dstuv = vec_perm( srcv1, srcv2, mask[0] );
218             vec_u8_t dstvv = vec_perm( srcv1, srcv2, mask[1] );
219 
220             vec_vsx_st( dstuv, x, dstu );
221             vec_vsx_st( dstvv, x, dstv );
222         }
223     }
224 }
225 
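/* Deinterleave UVUV... rows into the encode buffer: the even (U) bytes land in the
 * first half and the odd (V) bytes in the second half of each FENC row; two rows
 * are processed per iteration. */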
226 static void load_deinterleave_chroma_fenc_altivec( uint8_t *dst, uint8_t *src, intptr_t i_src, int height )
227 {
228     const vec_u8_t mask = { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F };
229 
230     for( int y = 0; y < height; y += 2, dst += 2*FENC_STRIDE, src += 2*i_src )
231     {
232         vec_u8_t src0 = vec_ld( 0, src );
233         vec_u8_t src1 = vec_ld( i_src, src );
234 
235         vec_st( vec_perm( src0, src0, mask ), 0*FENC_STRIDE, dst );
236         vec_st( vec_perm( src1, src1, mask ), 1*FENC_STRIDE, dst );
237     }
238 }
239 
240 #if HAVE_VSX
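/* Split packed RGB into three planes.  pw == 3 is the 24-bit packed case; the other
 * branch handles 4-byte pixels, discarding the fourth channel. */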
241 void x264_plane_copy_deinterleave_rgb_altivec( uint8_t *dsta, intptr_t i_dsta,
242                                                uint8_t *dstb, intptr_t i_dstb,
243                                                uint8_t *dstc, intptr_t i_dstc,
244                                                uint8_t *src, intptr_t i_src,
245                                                int pw, int w, int h )
246 {
247     if( pw == 3 )
248     {
249         const vec_u8_t mask[4] = {
250             { 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16 },
251             { 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E },
252             { 0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
253             { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F }
254         };
255 
256         for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
257         {
258             for( int x = 0; x < w; x += 16 )
259             {
260                 vec_u8_t srcv1 = vec_vsx_ld( 3 * x, src );
261                 vec_u8_t srcv2 = vec_vsx_ld( 3 * x + 16, src );
262                 vec_u8_t srcv3 = vec_vsx_ld( 3 * x + 32, src );
263                 vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0  a1  a2  a3  a4  a5  a6  a7  b0  b1  b2  b3  b4  b5  b6  b7
264                 vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv2, srcv3, mask[1] ); // a8  a9  a10 a11 a12 a13 a14 a15 b8  b9  b10 b11 b12 b13 b14 b15
265                 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta );
266                 vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb );
267 
268                 srcv1 = vec_perm( srcv1, srcv2, mask[2] );          // c0  c1  c2  c3  c4  c5  c6  c7  c8  c9
269                 srcv1 = vec_perm( srcv1, srcv3, mask[3] );          // c0  c1  c2  c3  c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
270                 vec_st( srcv1, x, dstc );
271             }
272         }
273     }
274     else
275     {
276         const vec_u8_t mask[2] = {
277             { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D },
278             { 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F }
279         };
280 
281         for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src )
282         {
283             for( int x = 0; x < w; x += 16 )
284             {
285                 vec_u8_t srcv1 = vec_vsx_ld( 4 * x, src );
286                 vec_u8_t srcv2 = vec_vsx_ld( 4 * x + 16, src );
287                 vec_u8_t srcv3 = vec_vsx_ld( 4 * x + 32, src );
288                 vec_u8_t srcv4 = vec_vsx_ld( 4 * x + 48, src );
289 
290                 vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0  a1  a2  a3  a4  a5  a6  a7  b0  b1  b2  b3  b4  b5  b6  b7
291                 vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[0] ); // a8  a9  a10 a11 a12 a13 a14 a15 b8  b9  b10 b11 b12 b13 b14 b15
292                 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta );
293                 vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb );
294 
295                 tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[1] );           // c0  c1  c2  c3  c4  c5  c6  c7
296                 tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[1] );           // c8  c9  c10 c11 c12 c13 c14 c15
297                 vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dstc );
298             }
299         }
300     }
301 }
302 #endif
303 
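/* Fullpel/subpel luma prediction into dst.  The quarter-pel position is encoded in
 * qpel_idx; x264_hpel_ref0/ref1 pick which of the four half-pel planes in src[] to
 * use, and (qpel_idx & 5) is non-zero when two planes must be averaged. */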
304 static void mc_luma_altivec( uint8_t *dst,    intptr_t i_dst_stride,
305                              uint8_t *src[4], intptr_t i_src_stride,
306                              int mvx, int mvy,
307                              int i_width, int i_height, const x264_weight_t *weight )
308 {
309     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
310     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
311     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
312     if( qpel_idx & 5 ) /* qpel interpolation needed */
313     {
314         uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
315 
316         switch( i_width )
317         {
318             case 4:
319                 pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
320                 break;
321             case 8:
322                 pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
323                 break;
324             case 16:
325             default:
326                 pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
327         }
328         if( weight->weightfn )
329             weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
330     }
331     else if( weight->weightfn )
332         weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
333     else
334     {
335         switch( i_width )
336         {
337             case 4:
338                 mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
339                 break;
340             case 8:
341                 mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
342                 break;
343             case 16:
344                 mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
345                 break;
346         }
347     }
348 }
349 
350 
351 
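/* Same plane selection as mc_luma_altivec, but when no averaging or weighting is
 * needed it simply returns a pointer into the chosen reference plane and updates
 * *i_dst_stride instead of copying. */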
352 static uint8_t *get_ref_altivec( uint8_t *dst,   intptr_t *i_dst_stride,
353                                  uint8_t *src[4], intptr_t i_src_stride,
354                                  int mvx, int mvy,
355                                  int i_width, int i_height, const x264_weight_t *weight )
356 {
357     int qpel_idx = ((mvy&3)<<2) + (mvx&3);
358     intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
359     uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
360     if( qpel_idx & 5 ) /* qpel interpolation needed */
361     {
362         uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
363         switch( i_width )
364         {
365             case 4:
366                 pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
367                 break;
368             case 8:
369                 pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
370                 break;
371             case 12:
372             case 16:
373             default:
374                 pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
375                 break;
376             case 20:
377                 pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
378                 break;
379         }
380         if( weight->weightfn )
381             weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
382         return dst;
383     }
384     else if( weight->weightfn )
385     {
386         weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
387         return dst;
388     }
389     else
390     {
391         *i_dst_stride = i_src_stride;
392         return src1;
393     }
394 }
395 
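/* Scalar fallback for 2-pixel-wide chroma blocks: 1/8-pel bilinear interpolation on
 * interleaved UV input; coefficients cA..cD sum to 64, rounded with +32 and >> 6. */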
396 static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
397                            uint8_t *src, intptr_t i_src_stride,
398                            int mvx, int mvy, int i_height )
399 {
400     uint8_t *srcp;
401     int d8x = mvx&0x07;
402     int d8y = mvy&0x07;
403 
404     int cA = (8-d8x)*(8-d8y);
405     int cB = d8x    *(8-d8y);
406     int cC = (8-d8x)*d8y;
407     int cD = d8x    *d8y;
408 
409     src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
410     srcp = &src[i_src_stride];
411 
412     for( int y = 0; y < i_height; y++ )
413     {
414         dstu[0] = ( cA*src[0] + cB*src[2] + cC*srcp[0] + cD*srcp[2] + 32 ) >> 6;
415         dstv[0] = ( cA*src[1] + cB*src[3] + cC*srcp[1] + cD*srcp[3] + 32 ) >> 6;
416         dstu[1] = ( cA*src[2] + cB*src[4] + cC*srcp[2] + cD*srcp[4] + 32 ) >> 6;
417         dstv[1] = ( cA*src[3] + cB*src[5] + cC*srcp[3] + cD*srcp[5] + 32 ) >> 6;
418 
419         src  += i_src_stride;
420         srcp += i_src_stride;
421         dstu += i_dst_stride;
422         dstv += i_dst_stride;
423     }
424 }
425 
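/* VSLD(a,b,n): endian-neutral vec_sld, yielding the 16 bytes that start n bytes into
 * the concatenation of a and b on both big- and little-endian hosts. */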
426 #ifdef WORDS_BIGENDIAN
427 #define VSLD(a,b,n) vec_sld(a,b,n)
428 #else
429 #define VSLD(a,b,n) vec_sld(b,a,16-n)
430 #endif
431 
432 #ifndef __POWER9_VECTOR__
433 #define STORE4_ALIGNED(d, s) vec_ste( (vec_u32_t)s, 0, (uint32_t*) d )
434 #define STORE2_UNALIGNED(d, s) vec_ste( vec_splat( (vec_u16_t)s, 0 ), 0, (uint16_t*)d )
435 #else
436 #define STORE4_ALIGNED(d, s) vec_xst_len( (vec_u8_t)s, d, 4 )
437 #define STORE2_UNALIGNED(d, s) vec_xst_len( (vec_u8_t)s, d, 2 )
438 #endif
439 
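/* Vectorised bilinear chroma MC for 4-pixel-wide blocks.  The four coefficients are
 * splatted from the aligned coeff[] array and two output rows are filtered per loop
 * iteration; perm0v/perm1v de-interleave the packed result into U and V before the
 * 4-byte stores. */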
440 static void mc_chroma_4xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
441                                    uint8_t *src, intptr_t i_src_stride,
442                                    int mvx, int mvy, int i_height )
443 {
444     uint8_t *srcp;
445     int d8x = mvx & 0x07;
446     int d8y = mvy & 0x07;
447 
448     ALIGNED_16( uint16_t coeff[4] );
449     coeff[0] = (8-d8x)*(8-d8y);
450     coeff[1] = d8x    *(8-d8y);
451     coeff[2] = (8-d8x)*d8y;
452     coeff[3] = d8x    *d8y;
453 
454     src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
455     srcp = &src[i_src_stride];
456 
457     LOAD_ZERO;
458     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
459     vec_u8_t    src2v_8, dstuv, dstvv;
460     vec_u16_t   src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
461     vec_u16_t   shiftv, k32v;
462 
463 #ifdef WORDS_BIGENDIAN
464     static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
465     static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
466 #else
467     static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
468     static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
469 #endif
470 
471     coeff0v = vec_ld( 0, coeff );
472     coeff3v = vec_splat( coeff0v, 3 );
473     coeff2v = vec_splat( coeff0v, 2 );
474     coeff1v = vec_splat( coeff0v, 1 );
475     coeff0v = vec_splat( coeff0v, 0 );
476     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
477     shiftv  = vec_splat_u16( 6 );
478 
479     src2v_8 = vec_vsx_ld( 0, src );
480     src2v_16 = vec_u8_to_u16( src2v_8 );
481     src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
482 
483     for( int y = 0; y < i_height; y += 2 )
484     {
485         src0v_16 = src2v_16;
486         src1v_16 = src3v_16;
487         src2v_8 = vec_vsx_ld( 0, srcp );
488         src2v_16 = vec_u8_to_u16( src2v_8 );
489         src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
490 
491         dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
492         dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
493         dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
494         dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
495 
496         dstv16 = vec_sr( dstv16, shiftv );
497 
498         dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
499         dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
500         STORE4_ALIGNED( dstu, dstuv );
501         STORE4_ALIGNED( dstv, dstvv );
502 
503         srcp += i_src_stride;
504         dstu += i_dst_stride;
505         dstv += i_dst_stride;
506 
507         src0v_16 = src2v_16;
508         src1v_16 = src3v_16;
509         src2v_8 = vec_vsx_ld( 0, srcp );
510         src2v_16 = vec_u8_to_u16( src2v_8 );
511         src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );
512 
513         dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
514         dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
515         dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
516         dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );
517 
518         dstv16 = vec_sr( dstv16, shiftv );
519 
520         dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
521         dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
522         STORE4_ALIGNED( dstu, dstuv );
523         STORE4_ALIGNED( dstv, dstvv );
524 
525         srcp += i_src_stride;
526         dstu += i_dst_stride;
527         dstv += i_dst_stride;
528     }
529 }
530 
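/* 8-pixel-wide variant: identical filter, but the 16 interleaved UV source bytes are
 * widened into high/low 16-bit halves so a full row is filtered per pass. */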
531 static void mc_chroma_8xh_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
532                                    uint8_t *src, intptr_t i_src_stride,
533                                    int mvx, int mvy, int i_height )
534 {
535     uint8_t *srcp;
536     int d8x = mvx & 0x07;
537     int d8y = mvy & 0x07;
538 
539     ALIGNED_16( uint16_t coeff[4] );
540     coeff[0] = (8-d8x)*(8-d8y);
541     coeff[1] = d8x    *(8-d8y);
542     coeff[2] = (8-d8x)*d8y;
543     coeff[3] = d8x    *d8y;
544 
545     src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
546     srcp = &src[i_src_stride];
547 
548     LOAD_ZERO;
549     vec_u16_t   coeff0v, coeff1v, coeff2v, coeff3v;
550     vec_u8_t    src0v_8, src1v_8, src2v_8, src3v_8;
551     vec_u8_t    dstuv, dstvv;
552     vec_u16_t   src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
553     vec_u16_t   src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
554     vec_u16_t   shiftv, k32v;
555 
556     coeff0v = vec_ld( 0, coeff );
557     coeff3v = vec_splat( coeff0v, 3 );
558     coeff2v = vec_splat( coeff0v, 2 );
559     coeff1v = vec_splat( coeff0v, 1 );
560     coeff0v = vec_splat( coeff0v, 0 );
561     k32v    = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) );
562     shiftv  = vec_splat_u16( 6 );
563 
564 #ifdef WORDS_BIGENDIAN
565     static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
566     static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
567 #else
568     static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
569     static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
570 #endif
571 
572     src2v_8 = vec_vsx_ld( 0, src );
573     src3v_8 = vec_vsx_ld( 16, src );
574     src3v_8 = VSLD( src2v_8, src3v_8, 2 );
575 
576     for( int y = 0; y < i_height; y += 2 )
577     {
578         src0v_8 = src2v_8;
579         src1v_8 = src3v_8;
580         src2v_8 = vec_vsx_ld( 0, srcp );
581         src3v_8 = vec_vsx_ld( 16, srcp );
582 
583         src3v_8 = VSLD( src2v_8, src3v_8, 2 );
584 
585         src0v_16h = vec_u8_to_u16_h( src0v_8 );
586         src0v_16l = vec_u8_to_u16_l( src0v_8 );
587         src1v_16h = vec_u8_to_u16_h( src1v_8 );
588         src1v_16l = vec_u8_to_u16_l( src1v_8 );
589         src2v_16h = vec_u8_to_u16_h( src2v_8 );
590         src2v_16l = vec_u8_to_u16_l( src2v_8 );
591         src3v_16h = vec_u8_to_u16_h( src3v_8 );
592         src3v_16l = vec_u8_to_u16_l( src3v_8 );
593 
594         dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
595         dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
596         dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
597         dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
598         dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
599         dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
600         dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
601         dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
602 
603         dstv_16h = vec_sr( dstv_16h, shiftv );
604         dstv_16l = vec_sr( dstv_16l, shiftv );
605 
606         dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
607         dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
608 
609         VEC_STORE8( dstuv, dstu );
610         VEC_STORE8( dstvv, dstv );
611 
612         srcp += i_src_stride;
613         dstu += i_dst_stride;
614         dstv += i_dst_stride;
615 
616         src0v_8 = src2v_8;
617         src1v_8 = src3v_8;
618         src2v_8 = vec_vsx_ld( 0, srcp );
619         src3v_8 = vec_vsx_ld( 16, srcp );
620 
621         src3v_8 = VSLD( src2v_8, src3v_8, 2 );
622 
623         src0v_16h = vec_u8_to_u16_h( src0v_8 );
624         src0v_16l = vec_u8_to_u16_l( src0v_8 );
625         src1v_16h = vec_u8_to_u16_h( src1v_8 );
626         src1v_16l = vec_u8_to_u16_l( src1v_8 );
627         src2v_16h = vec_u8_to_u16_h( src2v_8 );
628         src2v_16l = vec_u8_to_u16_l( src2v_8 );
629         src3v_16h = vec_u8_to_u16_h( src3v_8 );
630         src3v_16l = vec_u8_to_u16_l( src3v_8 );
631 
632         dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
633         dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
634         dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
635         dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
636         dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
637         dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
638         dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
639         dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );
640 
641         dstv_16h = vec_sr( dstv_16h, shiftv );
642         dstv_16l = vec_sr( dstv_16l, shiftv );
643 
644         dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
645         dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );
646 
647         VEC_STORE8( dstuv, dstu );
648         VEC_STORE8( dstvv, dstv );
649 
650         srcp += i_src_stride;
651         dstu += i_dst_stride;
652         dstv += i_dst_stride;
653     }
654 }
655 
656 static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
657                                uint8_t *src, intptr_t i_src_stride,
658                                int mvx, int mvy, int i_width, int i_height )
659 {
660     if( i_width == 8 )
661         mc_chroma_8xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride,
662                                mvx, mvy, i_height );
663     else if( i_width == 4 )
664         mc_chroma_4xh_altivec( dstu, dstv, i_dst_stride, src, i_src_stride,
665                                mvx, mvy, i_height );
666     else
667         mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
668                        mvx, mvy, i_height );
669 }
670 
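/* Half-pel interpolation uses the 6-tap H.264 kernel [1 -5 20 20 -5 1].  With the tap
 * pairs pre-added (a = t1+t6, b = t2+t5, c = t3+t4), HPEL_FILTER_1 computes
 * a - 5*b + 20*c, and HPEL_FILTER_2 computes the same value scaled down by 16 in a way
 * that keeps the intermediates within 16-bit range (used for the centre plane). */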
671 #define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
672 {                                                     \
673     t1v = vec_add( t1v, t6v );                        \
674     t2v = vec_add( t2v, t5v );                        \
675     t3v = vec_add( t3v, t4v );                        \
676                                                       \
677     t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
678     t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
679     t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
680     t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
681     t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
682     t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
683 }
684 
685 #define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v ) \
686 {                                                     \
687     t1v = vec_add( t1v, t6v );                        \
688     t2v = vec_add( t2v, t5v );                        \
689     t3v = vec_add( t3v, t4v );                        \
690                                                       \
691     t1v = vec_sub( t1v, t2v );  /* (a-b) */           \
692     t1v = vec_sra( t1v, twov ); /* (a-b)/4 */         \
693     t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */       \
694     t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */     \
695     t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */ \
696     t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
697 }
698 
699 #define HPEL_FILTER_HORIZONTAL()                             \
700 {                                                            \
701     src1v = vec_vsx_ld( x- 2+i_stride*y, src );              \
702     src6v = vec_vsx_ld( x+14+i_stride*y, src );              \
703                                                              \
704     src2v = VSLD( src1v, src6v,  1 );                        \
705     src3v = VSLD( src1v, src6v,  2 );                        \
706     src4v = VSLD( src1v, src6v,  3 );                        \
707     src5v = VSLD( src1v, src6v,  4 );                        \
708     src6v = VSLD( src1v, src6v,  5 );                        \
709                                                              \
710     temp1v = vec_u8_to_s16_h( src1v );                       \
711     temp2v = vec_u8_to_s16_h( src2v );                       \
712     temp3v = vec_u8_to_s16_h( src3v );                       \
713     temp4v = vec_u8_to_s16_h( src4v );                       \
714     temp5v = vec_u8_to_s16_h( src5v );                       \
715     temp6v = vec_u8_to_s16_h( src6v );                       \
716                                                              \
717     HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
718                    temp4v, temp5v, temp6v );                 \
719                                                              \
720     dest1v = vec_add( temp1v, sixteenv );                    \
721     dest1v = vec_sra( dest1v, fivev );                       \
722                                                              \
723     temp1v = vec_u8_to_s16_l( src1v );                       \
724     temp2v = vec_u8_to_s16_l( src2v );                       \
725     temp3v = vec_u8_to_s16_l( src3v );                       \
726     temp4v = vec_u8_to_s16_l( src4v );                       \
727     temp5v = vec_u8_to_s16_l( src5v );                       \
728     temp6v = vec_u8_to_s16_l( src6v );                       \
729                                                              \
730     HPEL_FILTER_1( temp1v, temp2v, temp3v,                   \
731                    temp4v, temp5v, temp6v );                 \
732                                                              \
733     dest2v = vec_add( temp1v, sixteenv );                    \
734     dest2v = vec_sra( dest2v, fivev );                       \
735                                                              \
736     destv = vec_packsu( dest1v, dest2v );                    \
737                                                              \
738     vec_vsx_st( destv, x+i_stride*y, dsth );                 \
739 }
740 
741 #define HPEL_FILTER_VERTICAL()                                    \
742 {                                                                 \
743     src1v = vec_vsx_ld( x+i_stride*(y-2), src );                  \
744     src2v = vec_vsx_ld( x+i_stride*(y-1), src );                  \
745     src3v = vec_vsx_ld( x+i_stride*(y-0), src );                  \
746     src4v = vec_vsx_ld( x+i_stride*(y+1), src );                  \
747     src5v = vec_vsx_ld( x+i_stride*(y+2), src );                  \
748     src6v = vec_vsx_ld( x+i_stride*(y+3), src );                  \
749                                                                   \
750     temp1v = vec_u8_to_s16_h( src1v );                            \
751     temp2v = vec_u8_to_s16_h( src2v );                            \
752     temp3v = vec_u8_to_s16_h( src3v );                            \
753     temp4v = vec_u8_to_s16_h( src4v );                            \
754     temp5v = vec_u8_to_s16_h( src5v );                            \
755     temp6v = vec_u8_to_s16_h( src6v );                            \
756                                                                   \
757     HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
758                    temp4v, temp5v, temp6v );                      \
759                                                                   \
760     dest1v = vec_add( temp1v, sixteenv );                         \
761     dest1v = vec_sra( dest1v, fivev );                            \
762                                                                   \
763     temp4v = vec_u8_to_s16_l( src1v );                            \
764     temp5v = vec_u8_to_s16_l( src2v );                            \
765     temp6v = vec_u8_to_s16_l( src3v );                            \
766     temp7v = vec_u8_to_s16_l( src4v );                            \
767     temp8v = vec_u8_to_s16_l( src5v );                            \
768     temp9v = vec_u8_to_s16_l( src6v );                            \
769                                                                   \
770     HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
771                    temp7v, temp8v, temp9v );                      \
772                                                                   \
773     dest2v = vec_add( temp4v, sixteenv );                         \
774     dest2v = vec_sra( dest2v, fivev );                            \
775                                                                   \
776     destv = vec_packsu( dest1v, dest2v );                         \
777                                                                   \
778     vec_vsx_st( destv, x+i_stride*y, dstv );                      \
779 }
780 
781 #define HPEL_FILTER_CENTRAL()                           \
782 {                                                       \
783     temp1v = VSLD( tempav, tempbv, 12 );                \
784     temp2v = VSLD( tempav, tempbv, 14 );                \
785     temp3v = tempbv;                                    \
786     temp4v = VSLD( tempbv, tempcv,  2 );                \
787     temp5v = VSLD( tempbv, tempcv,  4 );                \
788     temp6v = VSLD( tempbv, tempcv,  6 );                \
789                                                         \
790     HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
791                    temp4v, temp5v, temp6v );            \
792                                                         \
793     dest1v = vec_add( temp1v, thirtytwov );             \
794     dest1v = vec_sra( dest1v, sixv );                   \
795                                                         \
796     temp1v = VSLD( tempbv, tempcv, 12 );                \
797     temp2v = VSLD( tempbv, tempcv, 14 );                \
798     temp3v = tempcv;                                    \
799     temp4v = VSLD( tempcv, tempdv,  2 );                \
800     temp5v = VSLD( tempcv, tempdv,  4 );                \
801     temp6v = VSLD( tempcv, tempdv,  6 );                \
802                                                         \
803     HPEL_FILTER_2( temp1v, temp2v, temp3v,              \
804                    temp4v, temp5v, temp6v );            \
805                                                         \
806     dest2v = vec_add( temp1v, thirtytwov );             \
807     dest2v = vec_sra( dest2v, sixv );                   \
808                                                         \
809     destv = vec_packsu( dest1v, dest2v );               \
810                                                         \
811     vec_vsx_st( destv, x-16+i_stride*y, dstc );         \
812 }
813 
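/* Produce the three half-pel planes for one luma plane: dsth (horizontal), dstv
 * (vertical) and dstc (centre/diagonal).  The centre plane is built by running the
 * horizontal filter over the 16-bit vertical-filter results carried across iterations
 * in tempav..tempev. */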
814 void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
815                                intptr_t i_stride, int i_width, int i_height, int16_t *buf )
816 {
817     vec_u8_t destv;
818     vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
819     vec_s16_t dest1v, dest2v;
820     vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
821     vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
822 
823     LOAD_ZERO;
824 
825     vec_u16_t twov, fourv, fivev, sixv;
826     vec_s16_t sixteenv, thirtytwov;
827 
828     twov = vec_splats( (uint16_t)2 );
829     fourv = vec_splats( (uint16_t)4 );
830     fivev = vec_splats( (uint16_t)5 );
831     sixv = vec_splats( (uint16_t)6 );
832     sixteenv = vec_splats( (int16_t)16 );
833     thirtytwov = vec_splats( (int16_t)32 );
834 
835     for( int y = 0; y < i_height; y++ )
836     {
837         int x = 0;
838 
839         /* horizontal_filter */
840         HPEL_FILTER_HORIZONTAL();
841 
842         /* vertical_filter */
843         HPEL_FILTER_VERTICAL();
844 
845         /* central_filter */
846         tempav = tempcv;
847         tempbv = tempdv;
848         tempcv = vec_splat( temp1v, 0 ); /* first only */
849         tempdv = temp1v;
850         tempev = temp4v;
851 
852         for( x = 16; x < i_width; x+=16 )
853         {
854             /* horizontal_filter */
855             HPEL_FILTER_HORIZONTAL();
856 
857             /* vertical_filter */
858             HPEL_FILTER_VERTICAL();
859 
860             /* central_filter */
861             tempav = tempcv;
862             tempbv = tempdv;
863             tempcv = tempev;
864             tempdv = temp1v;
865             tempev = temp4v;
866 
867             HPEL_FILTER_CENTRAL();
868         }
869 
870         /* Partial vertical filter */
871         src1v = vec_vsx_ld( x+i_stride*(y-2), src );
872         src2v = vec_vsx_ld( x+i_stride*(y-1), src );
873         src3v = vec_vsx_ld( x+i_stride*(y-0), src );
874         src4v = vec_vsx_ld( x+i_stride*(y+1), src );
875         src5v = vec_vsx_ld( x+i_stride*(y+2), src );
876         src6v = vec_vsx_ld( x+i_stride*(y+3), src );
877 
878         temp1v = vec_u8_to_s16_h( src1v );
879         temp2v = vec_u8_to_s16_h( src2v );
880         temp3v = vec_u8_to_s16_h( src3v );
881         temp4v = vec_u8_to_s16_h( src4v );
882         temp5v = vec_u8_to_s16_h( src5v );
883         temp6v = vec_u8_to_s16_h( src6v );
884 
885         HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v );
886 
887         /* central_filter */
888         tempav = tempcv;
889         tempbv = tempdv;
890         tempcv = tempev;
891         tempdv = temp1v;
892         /* tempev is not used */
893 
894         HPEL_FILTER_CENTRAL();
895     }
896 }
897 
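/* Downscale one plane by 2 for the lookahead: dst0/dsth/dstv/dstc are the four
 * half-resolution planes, with dsth/dstv/dstc offset by half a lowres pixel
 * horizontally, vertically and in both directions relative to dst0; each output pixel
 * is a rounded average of the corresponding source neighbourhood. */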
898 static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
899                                             intptr_t src_stride, intptr_t dst_stride, int width, int height )
900 {
901     int w = width >> 4;
902     int end = (width & 15);
903     vec_u8_t src0v, src1v, src2v;
904     vec_u8_t lv, hv, src1p1v;
905     vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
906     static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
907 #ifndef WORDS_BIGENDIAN
908     static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
909 #endif
910 
911     for( int y = 0; y < height; y++ )
912     {
913         int x;
914         uint8_t *src1 = src0+src_stride;
915         uint8_t *src2 = src1+src_stride;
916 
917         src0v = vec_ld(0, src0);
918         src1v = vec_ld(0, src1);
919         src2v = vec_ld(0, src2);
920 
921         avg0v = vec_avg(src0v, src1v);
922         avg1v = vec_avg(src1v, src2v);
923 
924         for( x = 0; x < w; x++ )
925         {
926             lv = vec_ld(16*(x*2+1), src0);
927             src1v = vec_ld(16*(x*2+1), src1);
928             avghv = vec_avg(lv, src1v);
929 
930             lv = vec_ld(16*(x*2+2), src0);
931             src1p1v = vec_ld(16*(x*2+2), src1);
932             avghp1v = vec_avg(lv, src1p1v);
933 
934             avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
935             avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
936 
937             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
938 #ifdef WORDS_BIGENDIAN
939             vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
940 #else
941             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
942 #endif
943 
944             avg0v = avghp1v;
945 
946             hv = vec_ld(16*(x*2+1), src2);
947             avghv = vec_avg(src1v, hv);
948 
949             hv = vec_ld(16*(x*2+2), src2);
950             avghp1v = vec_avg(src1p1v, hv);
951 
952             avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
953             avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);
954 
955             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
956 #ifdef WORDS_BIGENDIAN
957             vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
958 #else
959             vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
960 #endif
961 
962             avg1v = avghp1v;
963 
964         }
965         if( end )
966         {
967             lv = vec_ld(16*(x*2+1), src0);
968             src1v = vec_ld(16*(x*2+1), src1);
969             avghv = vec_avg(lv, src1v);
970 
971             lv = vec_ld(16*(x*2+1), src2);
972             avghp1v = vec_avg(src1v, lv);
973 
974             avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
975             avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);
976 
977             lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
978 #ifdef WORDS_BIGENDIAN
979             hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
980 #else
981             hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
982 #endif
983 
984             VEC_STORE8( lv, dst0 + 16 * x );
985             VEC_STORE8( hv, dsth + 16 * x );
986 
987             lv = vec_sld(lv, lv, 8);
988             hv = vec_sld(hv, hv, 8);
989 
990             VEC_STORE8( lv, dstv + 16 * x );
991             VEC_STORE8( hv, dstc + 16 * x );
992         }
993 
994         src0 += src_stride*2;
995         dst0 += dst_stride;
996         dsth += dst_stride;
997         dstv += dst_stride;
998         dstc += dst_stride;
999     }
1000 }
1001 
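/* Explicit weighted prediction: dst = clip( ((src * i_scale + (1 << (i_denom-1))) >> i_denom) + i_offset ),
 * with the rounding/shift skipped when i_denom is 0.  Variants for block widths
 * 2/4/8/16/20 follow; saturation to 8 bits comes from vec_packsu. */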
1002 static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
1003                                   const x264_weight_t *weight, int i_height )
1004 {
1005     LOAD_ZERO;
1006     vec_u8_t srcv;
1007     vec_s16_t weightv;
1008     vec_s16_t scalev, offsetv, denomv, roundv;
1009 
1010     int denom = weight->i_denom;
1011 
1012     scalev = vec_splats( (int16_t)weight->i_scale );
1013     offsetv = vec_splats( (int16_t)weight->i_offset );
1014 
1015     if( denom >= 1 )
1016     {
1017         denomv = vec_splats( (int16_t)denom );
1018         roundv = vec_splats( (int16_t)(1 << (denom - 1)) );
1019 
1020         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1021         {
1022             srcv = vec_vsx_ld( 0, src );
1023             weightv = vec_u8_to_s16( srcv );
1024 
1025             weightv = vec_mladd( weightv, scalev, roundv );
1026             weightv = vec_sra( weightv, (vec_u16_t)denomv );
1027             weightv = vec_add( weightv, offsetv );
1028 
1029             srcv = vec_packsu( weightv, zero_s16v );
1030             STORE2_UNALIGNED( dst, srcv );
1031         }
1032     }
1033     else
1034     {
1035         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1036         {
1037             srcv = vec_vsx_ld( 0, src );
1038             weightv = vec_u8_to_s16( srcv );
1039 
1040             weightv = vec_mladd( weightv, scalev, offsetv );
1041 
1042             srcv = vec_packsu( weightv, zero_s16v );
1043             STORE2_UNALIGNED( dst, srcv );
1044         }
1045     }
1046 }
1047 static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
1048                                   const x264_weight_t *weight, int i_height )
1049 {
1050     LOAD_ZERO;
1051     vec_u8_t srcv;
1052     vec_s16_t weightv;
1053     vec_s16_t scalev, offsetv, denomv, roundv;
1054 
1055     int denom = weight->i_denom;
1056 
1057     scalev = vec_splats( (int16_t)weight->i_scale );
1058     offsetv = vec_splats( (int16_t)weight->i_offset );
1059 
1060     if( denom >= 1 )
1061     {
1062         denomv = vec_splats( (int16_t)denom );
1063         roundv = vec_splats( (int16_t)(1 << (denom - 1)) );
1064 
1065         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1066         {
1067             srcv = vec_vsx_ld( 0, src );
1068             weightv = vec_u8_to_s16( srcv );
1069 
1070             weightv = vec_mladd( weightv, scalev, roundv );
1071             weightv = vec_sra( weightv, (vec_u16_t)denomv );
1072             weightv = vec_add( weightv, offsetv );
1073 
1074             srcv = vec_packsu( weightv, zero_s16v );
1075             vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
1076         }
1077     }
1078     else
1079     {
1080         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1081         {
1082             srcv = vec_vsx_ld( 0, src );
1083             weightv = vec_u8_to_s16( srcv );
1084 
1085             weightv = vec_mladd( weightv, scalev, offsetv );
1086 
1087             srcv = vec_packsu( weightv, zero_s16v );
1088             vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
1089         }
1090     }
1091 }
1092 static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
1093                                   const x264_weight_t *weight, int i_height )
1094 {
1095     LOAD_ZERO;
1096     vec_u8_t srcv;
1097     vec_s16_t weightv;
1098     vec_s16_t scalev, offsetv, denomv, roundv;
1099 
1100     int denom = weight->i_denom;
1101 
1102     scalev = vec_splats( (int16_t)weight->i_scale );
1103     offsetv = vec_splats( (int16_t)weight->i_offset );
1104 
1105     if( denom >= 1 )
1106     {
1107         denomv = vec_splats( (int16_t)denom );
1108         roundv = vec_splats( (int16_t)(1 << (denom - 1)) );
1109 
1110         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1111         {
1112             srcv = vec_vsx_ld( 0, src );
1113             weightv = vec_u8_to_s16( srcv );
1114 
1115             weightv = vec_mladd( weightv, scalev, roundv );
1116             weightv = vec_sra( weightv, (vec_u16_t)denomv );
1117             weightv = vec_add( weightv, offsetv );
1118 
1119             srcv = vec_packsu( weightv, zero_s16v );
1120             VEC_STORE8( srcv, dst );
1121         }
1122     }
1123     else
1124     {
1125         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1126         {
1127             srcv = vec_vsx_ld( 0, src );
1128             weightv = vec_u8_to_s16( srcv );
1129 
1130             weightv = vec_mladd( weightv, scalev, offsetv );
1131 
1132             srcv = vec_packsu( weightv, zero_s16v );
1133             VEC_STORE8( srcv, dst );
1134         }
1135     }
1136 }
1137 static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
1138                                    const x264_weight_t *weight, int i_height )
1139 {
1140     LOAD_ZERO;
1141     vec_u8_t srcv;
1142     vec_s16_t weight_lv, weight_hv;
1143     vec_s16_t scalev, offsetv, denomv, roundv;
1144 
1145     int denom = weight->i_denom;
1146 
1147     scalev = vec_splats( (int16_t)weight->i_scale );
1148     offsetv = vec_splats( (int16_t)weight->i_offset );
1149 
1150     if( denom >= 1 )
1151     {
1152         denomv = vec_splats( (int16_t)denom );
1153         roundv = vec_splats( (int16_t)(1 << (denom - 1)) );
1154 
1155         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1156         {
1157             srcv = vec_vsx_ld( 0, src );
1158             weight_hv = vec_u8_to_s16_h( srcv );
1159             weight_lv = vec_u8_to_s16_l( srcv );
1160 
1161             weight_hv = vec_mladd( weight_hv, scalev, roundv );
1162             weight_lv = vec_mladd( weight_lv, scalev, roundv );
1163             weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
1164             weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
1165             weight_hv = vec_add( weight_hv, offsetv );
1166             weight_lv = vec_add( weight_lv, offsetv );
1167 
1168             srcv = vec_packsu( weight_hv, weight_lv );
1169             vec_st( srcv, 0, dst );
1170         }
1171     }
1172     else
1173     {
1174         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1175         {
1176             srcv = vec_vsx_ld( 0, src );
1177             weight_hv = vec_u8_to_s16_h( srcv );
1178             weight_lv = vec_u8_to_s16_l( srcv );
1179 
1180             weight_hv = vec_mladd( weight_hv, scalev, offsetv );
1181             weight_lv = vec_mladd( weight_lv, scalev, offsetv );
1182 
1183             srcv = vec_packsu( weight_hv, weight_lv );
1184             vec_st( srcv, 0, dst );
1185         }
1186     }
1187 }
1188 static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
1189                                    const x264_weight_t *weight, int i_height )
1190 {
1191     LOAD_ZERO;
1192     vec_u8_t srcv, srcv2;
1193     vec_s16_t weight_lv, weight_hv, weight_3v;
1194     vec_s16_t scalev, offsetv, denomv, roundv;
1195 
1196     int denom = weight->i_denom;
1197 
1198     scalev = vec_splats( (int16_t)weight->i_scale );
1199     offsetv = vec_splats( (int16_t)weight->i_offset );
1200 
1201     if( denom >= 1 )
1202     {
1203         int16_t round = 1 << (denom - 1);
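            /* Lanes 0-3 of these constants weight pixels 16-19; lanes 4-7 are identity
             * (scale 1, shift 0, offset 0) so the bytes beyond the 20-pixel width are
             * passed through unchanged. */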
1204         vec_s16_t tab[4] = {
1205             { weight->i_scale, weight->i_scale, weight->i_scale, weight->i_scale, 1, 1, 1, 1 },
1206             { weight->i_offset, weight->i_offset, weight->i_offset, weight->i_offset, 0, 0, 0, 0 },
1207             { denom, denom, denom, denom, 0, 0, 0, 0 },
1208             { round, round, round, round, 0, 0, 0, 0 },
1209         };
1210 
1211         denomv = vec_splats( (int16_t)denom );
1212         roundv = vec_splats( (int16_t)(1 << (denom - 1)) );
1213 
1214         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1215         {
1216             srcv = vec_vsx_ld( 0, src );
1217             srcv2 = vec_vsx_ld( 16, src );
1218 
1219             weight_hv = vec_u8_to_s16_h( srcv );
1220             weight_lv = vec_u8_to_s16_l( srcv );
1221             weight_3v = vec_u8_to_s16_h( srcv2 );
1222 
1223             weight_hv = vec_mladd( weight_hv, scalev, roundv );
1224             weight_lv = vec_mladd( weight_lv, scalev, roundv );
1225             weight_3v = vec_mladd( weight_3v, tab[0], tab[3] );
1226 
1227             weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
1228             weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
1229             weight_3v = vec_sra( weight_3v, (vec_u16_t)tab[2] );
1230 
1231             weight_hv = vec_add( weight_hv, offsetv );
1232             weight_lv = vec_add( weight_lv, offsetv );
1233             weight_3v = vec_add( weight_3v, tab[1] );
1234 
1235             srcv = vec_packsu( weight_hv, weight_lv );
1236             srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) );
1237             vec_vsx_st( srcv, 0, dst );
1238             vec_vsx_st( srcv2, 16, dst );
1239         }
1240     }
1241     else
1242     {
1243         vec_s16_t offset_mask = { weight->i_offset, weight->i_offset, weight->i_offset,
1244                                   weight->i_offset, 0, 0, 0, 0 };
1245         for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
1246         {
1247             srcv = vec_vsx_ld( 0, src );
1248             srcv2 = vec_vsx_ld( 16, src );
1249 
1250             weight_hv = vec_u8_to_s16_h( srcv );
1251             weight_lv = vec_u8_to_s16_l( srcv );
1252             weight_3v = vec_u8_to_s16_h( srcv2 );
1253 
1254             weight_hv = vec_mladd( weight_hv, scalev, offsetv );
1255             weight_lv = vec_mladd( weight_lv, scalev, offsetv );
1256             weight_3v = vec_mladd( weight_3v, scalev, offset_mask );
1257 
1258             srcv = vec_packsu( weight_hv, weight_lv );
1259             srcv2 = vec_packsu( weight_3v, vec_u8_to_s16_l( srcv2 ) );
1260             vec_vsx_st( srcv, 0, dst );
1261             vec_vsx_st( srcv2, 16, dst );
1262         }
1263     }
1264 }
1265 
1266 static weight_fn_t mc_weight_wtab_altivec[6] =
1267 {
1268     mc_weight_w2_altivec,
1269     mc_weight_w4_altivec,
1270     mc_weight_w8_altivec,
1271     mc_weight_w16_altivec,
1272     mc_weight_w16_altivec,
1273     mc_weight_w20_altivec,
1274 };
1275 
1276 PLANE_COPY_SWAP(16, altivec)
1277 PLANE_INTERLEAVE(altivec)
1278 #endif // !HIGH_BIT_DEPTH
1279 
1280 #if HIGH_BIT_DEPTH
1281 
1282 #define LOAD_SRC( l )                   \
1283 {                                       \
1284     srcv[l] = vec_vsx_ld( s, src );     \
1285     s += 16;                            \
1286     srcv[l + 1] = vec_vsx_ld( s, src ); \
1287     s += 16;                            \
1288 }
1289 
1290 #define STORE_8( mask, shift, dst, a, b )                 \
1291 {                                                         \
1292     dstv = (vec_u16_t)vec_perm( srcv[a], srcv[b], mask ); \
1293     dstv = vec_sr( dstv, shift );                         \
1294     dstv = vec_and( dstv, and_mask );                     \
1295                                                           \
1296     vec_st( dstv, offset, dst );                          \
1297 }
1298 
1299 // v210 input is only compatible with bit-depth of 10 bits
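// Each 32-bit word of v210 packs three 10-bit components; the permute masks below
// gather the luma and chroma samples (per endianness) and the shift/AND sequence
// aligns each component to the low 10 bits.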
1300 void x264_plane_copy_deinterleave_v210_altivec( uint16_t *dsty, intptr_t i_dsty,
1301                                                 uint16_t *dstc, intptr_t i_dstc,
1302                                                 uint32_t *src, intptr_t i_src, int w, int h )
1303 {
1304 #ifdef WORDS_BIGENDIAN
1305     const vec_u8_t masky[3] = {
1306         { 0x02, 0x01, 0x05, 0x04, 0x07, 0x06, 0x0A, 0x09, 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14 },
1307         { 0x07, 0x06, 0x0A, 0x09, 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14, 0x17, 0x16, 0x1A, 0x19 },
1308         { 0x0D, 0x0C, 0x0F, 0x0E, 0x12, 0x11, 0x15, 0x14, 0x17, 0x16, 0x1A, 0x19, 0x1D, 0x1C, 0x1F, 0x1E }
1309     };
1310     const vec_u8_t maskc[3] = {
1311         { 0x01, 0x00, 0x03, 0x02, 0x06, 0x05, 0x09, 0x08, 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12 },
1312         { 0x06, 0x05, 0x09, 0x08, 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12, 0x16, 0x15, 0x19, 0x18 },
1313         { 0x0B, 0x0A, 0x0E, 0x0D, 0x11, 0x10, 0x13, 0x12, 0x16, 0x15, 0x19, 0x18, 0x1B, 0x1A, 0x1E, 0x1D }
1314     };
1315 #else
1316     const vec_u8_t masky[3] = {
1317         { 0x01, 0x02, 0x04, 0x05, 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15 },
1318         { 0x06, 0x07, 0x09, 0x0A, 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A },
1319         { 0x0C, 0x0D, 0x0E, 0x0F, 0x11, 0x12, 0x14, 0x15, 0x16, 0x17, 0x19, 0x1A, 0x1C, 0x1D, 0x1E, 0x1F }
1320     };
1321     const vec_u8_t maskc[3] = {
1322         { 0x00, 0x01, 0x02, 0x03, 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13 },
1323         { 0x05, 0x06, 0x08, 0x09, 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19 },
1324         { 0x0A, 0x0B, 0x0D, 0x0E, 0x10, 0x11, 0x12, 0x13, 0x15, 0x16, 0x18, 0x19, 0x1A, 0x1B, 0x1D, 0x1E }
1325     };
1326 #endif
1327     const vec_u16_t shift[3] = {
1328         { 0, 4, 2, 0, 4, 2, 0, 4 },
1329         { 2, 0, 4, 2, 0, 4, 2, 0 },
1330         { 4, 2, 0, 4, 2, 0, 4, 2 }
1331     };
1332 
1333     vec_u16_t dstv;
1334     vec_u16_t and_mask = vec_sub( vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 10 ) ), vec_splat_u16( 1 ) );
1335     vec_u32_t srcv[4];
1336 
1337     for( int i = 0; i < h; i++ )
1338     {
1339         int offset = 0;
1340         int s = 0;
1341 
1342         for( int j = 0; j < w; j += 24 )
1343         {
1344             LOAD_SRC( 0 );
1345             STORE_8( maskc[0], shift[0], dstc, 0, 1 );
1346             STORE_8( masky[0], shift[1], dsty, 0, 1 );
1347             offset += 16;
1348 
1349             LOAD_SRC( 2 );
1350             STORE_8( maskc[1], shift[1], dstc, 1, 2 );
1351             STORE_8( masky[1], shift[2], dsty, 1, 2 );
1352             offset += 16;
1353 
1354             STORE_8( maskc[2], shift[2], dstc, 2, 3 );
1355             STORE_8( masky[2], shift[0], dsty, 2, 3 );
1356             offset += 16;
1357         }
1358 
1359         dsty += i_dsty;
1360         dstc += i_dstc;
1361         src  += i_src;
1362     }
1363 }
1364 
1365 #endif // HIGH_BIT_DEPTH
1366 
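/* Wire the AltiVec/VSX implementations into the motion-compensation function table;
 * for high bit depth only the v210 deinterleave is provided. */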
1367 void x264_mc_init_altivec( x264_mc_functions_t *pf )
1368 {
1369 #if HIGH_BIT_DEPTH
1370     pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_altivec;
1371 #else // !HIGH_BIT_DEPTH
1372     pf->mc_luma   = mc_luma_altivec;
1373     pf->get_ref   = get_ref_altivec;
1374     pf->mc_chroma = mc_chroma_altivec;
1375 
1376     pf->copy_16x16_unaligned = mc_copy_w16_altivec;
1377     pf->copy[PIXEL_16x16] = mc_copy_w16_aligned_altivec;
1378 
1379     pf->hpel_filter = x264_hpel_filter_altivec;
1380     pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
1381 
1382     pf->weight = mc_weight_wtab_altivec;
1383 
1384     pf->plane_copy_swap = plane_copy_swap_altivec;
1385     pf->plane_copy_interleave = plane_copy_interleave_altivec;
1386     pf->store_interleave_chroma = x264_store_interleave_chroma_altivec;
1387     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_altivec;
1388     pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc_altivec;
1389 #if HAVE_VSX
1390     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_altivec;
1391 #endif // HAVE_VSX
1392 #endif // !HIGH_BIT_DEPTH
1393 }
1394