/*****************************************************************************
 * pixel.c: ppc pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2021 x264 project
 *
 * Authors: Eric Petit <eric.petit@lapsus.org>
 *          Guillaume Poirier <gpoirier@mplayerhq.hu>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "pixel.h"

#if !HIGH_BIT_DEPTH
/***********************************************************************
 * SAD routines
 **********************************************************************/

#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, intptr_t i_pix1,       \
                 uint8_t *pix2, intptr_t i_pix2 )      \
{                                                      \
    ALIGNED_16( int sum );                             \
                                                       \
    LOAD_ZERO;                                         \
    vec_u8_t  pix1v, pix2v;                            \
    vec_s32_t sumv = zero_s32v;                        \
    for( int y = 0; y < ly; y++ )                      \
    {                                                  \
        pix1v = vec_vsx_ld( 0, pix1 );                 \
        pix2v = vec_vsx_ld( 0, pix2 );                 \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_absd( pix1v, pix2v ),           \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}

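/* The (a, b) macro arguments select the final reduction: the 16-wide blocks
 * use vec_sums and read element 3, while the 8-wide blocks use vec_sum2s and
 * read element 1, which only covers the first 8 bytes of each 16-byte load;
 * the bytes read past the 8-pixel row land in the words that get discarded. */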
PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )



/***********************************************************************
 * SATD routines
 **********************************************************************/

/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )
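/* e.g. a = {1,2,3,4} gives b = {10,-4,0,-2} */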

/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Call vec_sub()/vec_max() instead of vec_abs() because vec_abs()
 * actually also calls vec_splat(0), but we already have a zero vector.
 **********************************************************************/
#define VEC_ABS(a)                            \
    a = vec_max( a, vec_sub( zero_s16v, a ) );

#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )

/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a:    s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )

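/* Returns per-word partial sums of |a| + |b| + |c| + |d|, ready for a final
 * vec_sum2s/vec_sums reduction. */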
static ALWAYS_INLINE vec_s32_t add_abs_4( vec_s16_t a, vec_s16_t b,
                                          vec_s16_t c, vec_s16_t d )
{
    vec_s16_t t0 = vec_abs( a );
    vec_s16_t t1 = vec_abs( b );
    vec_s16_t t2 = vec_abs( c );
    vec_s16_t t3 = vec_abs( d );

    vec_s16_t s0 = vec_adds( t0, t1 );
    vec_s16_t s1 = vec_adds( t2, t3 );

    vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
    vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );

    return vec_add( s01, s23 );
}

/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );

    /* Hadamard H */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

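    /* x264's SATD convention halves the summed Hadamard magnitudes. */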
    return i_satd >> 1;
}

/***********************************************************************
 * SATD 4x8
 **********************************************************************/
static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    satdv = add_abs_4( temp0v, temp1v, temp2v, temp3v );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    satdv = vec_add( satdv, add_abs_4( temp0v, temp1v, temp2v, temp3v ) );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

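/* Same as add_abs_4, extended to eight vectors; the second four partial sums
 * are accumulated directly on top of the first four via vec_sum4s. */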
static ALWAYS_INLINE vec_s32_t add_abs_8( vec_s16_t a, vec_s16_t b,
                                          vec_s16_t c, vec_s16_t d,
                                          vec_s16_t e, vec_s16_t f,
                                          vec_s16_t g, vec_s16_t h )
{
    vec_s16_t t0 = vec_abs( a );
    vec_s16_t t1 = vec_abs( b );
    vec_s16_t t2 = vec_abs( c );
    vec_s16_t t3 = vec_abs( d );

    vec_s16_t s0 = vec_adds( t0, t1 );
    vec_s16_t s1 = vec_adds( t2, t3 );

    vec_s32_t s01 = vec_sum4s( s0, vec_splat_s32( 0 ) );
    vec_s32_t s23 = vec_sum4s( s1, vec_splat_s32( 0 ) );

    vec_s16_t t4 = vec_abs( e );
    vec_s16_t t5 = vec_abs( f );
    vec_s16_t t6 = vec_abs( g );
    vec_s16_t t7 = vec_abs( h );

    vec_s16_t s2 = vec_adds( t4, t5 );
    vec_s16_t s3 = vec_adds( t6, t7 );

    vec_s32_t s0145 = vec_sum4s( s2, s01 );
    vec_s32_t s2367 = vec_sum4s( s3, s23 );

    return vec_add( s0145, s2367 );
}

/***********************************************************************
 * SATD 8x4
 **********************************************************************/
static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    /* This causes warnings because temp4v...temp7v haven't been set,
       but we don't care */
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v, temp7v );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 8x8
 **********************************************************************/
static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v, temp7v );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 8x16
 **********************************************************************/
static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                    uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    vec_s16_t diff0v, diff1v, diff2v, diff3v,
              diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v, temp7v );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v,
                                       temp4v, temp5v, temp6v, temp7v ) );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 16x8
 **********************************************************************/
static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                    uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

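    /* Each VEC_DIFF_HL splits a 16-pixel row of differences into its two
       8-pixel halves (diffh and diffl), which are then transformed as two
       independent 8x8 blocks. */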
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v, temp7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v,
                                       temp4v, temp5v, temp6v, temp7v ) );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}

/***********************************************************************
 * SATD 16x16
 **********************************************************************/
static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                     uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    LOAD_ZERO;
    vec_s32_t satdv;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = add_abs_8( temp0v, temp1v, temp2v, temp3v,
                       temp4v, temp5v, temp6v, temp7v );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v,
                                       temp4v, temp5v, temp6v, temp7v ) );

    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
    VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );
    VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v,
                                       temp4v, temp5v, temp6v, temp7v ) );

    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );
    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );
    VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
                 temp0v, temp1v, temp2v, temp3v );
    VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
                 temp4v, temp5v, temp6v, temp7v );

    satdv = vec_add( satdv, add_abs_8( temp0v, temp1v, temp2v, temp3v,
                                       temp4v, temp5v, temp6v, temp7v ) );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}



/***********************************************************************
 * Interleaved SAD routines
 **********************************************************************/

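/* The SAD_x3/x4 routines below process two rows per candidate on each loop
 * iteration, so the row loops run height/2 times. */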
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
                                        uint8_t *pix0, uint8_t *pix1,
                                        uint8_t *pix2, uint8_t *pix3,
                                        intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
                                        uint8_t *pix1, uint8_t *pix2,
                                        intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
                                       uint8_t *pix3, intptr_t i_stride, int scores[4] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );
    ALIGNED_16( int sum3 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
    vec_s32_t sum0v, sum1v, sum2v, sum3v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);
    sum3v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld( 0, fenc );
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );

        pix0v = vec_vsx_ld( 0, pix0 );
        pix0 += i_stride;

        pix1v = vec_vsx_ld( 0, pix1 );
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld( 0, pix2 );
        pix2 += i_stride;

        pix3v = vec_vsx_ld( 0, pix3 );
        pix3 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );
    sum3v = vec_sums( sum3v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );
    sum3v = vec_splat( sum3v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );
    vec_ste( sum3v, 0, &sum3 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
    scores[3] = sum3;
}

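/* PROCESS_PIXS loads two consecutive 8-pixel rows for fenc and for the three
 * candidates pix0..pix2, packs each pair of rows into one 16-byte vector with
 * xxpermdi, and accumulates the three partial SADs. */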
#define PROCESS_PIXS                                                                  \
        vec_u8_t pix0vH = vec_vsx_ld( 0, pix0 );                                      \
        pix0 += i_stride;                                                             \
                                                                                      \
        vec_u8_t pix1vH = vec_vsx_ld( 0, pix1 );                                      \
        pix1 += i_stride;                                                             \
                                                                                      \
        vec_u8_t fencvH = vec_vsx_ld( 0, fenc );                                      \
        fenc += FENC_STRIDE;                                                          \
                                                                                      \
        vec_u8_t pix2vH = vec_vsx_ld( 0, pix2 );                                      \
        pix2 += i_stride;                                                             \
                                                                                      \
        vec_u8_t pix0vL = vec_vsx_ld( 0, pix0 );                                      \
        pix0 += i_stride;                                                             \
                                                                                      \
        vec_u8_t pix1vL = vec_vsx_ld( 0, pix1 );                                      \
        pix1 += i_stride;                                                             \
                                                                                      \
        vec_u8_t fencvL = vec_vsx_ld( 0, fenc );                                      \
        fenc += FENC_STRIDE;                                                          \
                                                                                      \
        vec_u8_t pix2vL = vec_vsx_ld( 0, pix2 );                                      \
        pix2 += i_stride;                                                             \
                                                                                      \
        fencv = xxpermdi( fencvH, fencvL, 0 );                                        \
        pix0v = xxpermdi( pix0vH, pix0vL, 0 );                                        \
        pix1v = xxpermdi( pix1vH, pix1vL, 0 );                                        \
        pix2v = xxpermdi( pix2vH, pix2vL, 0 );                                        \
                                                                                      \
        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v ); \
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v ); \
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

#define PIXEL_SAD_X3_ALTIVEC( name, ly )            \
static void name( uint8_t *fenc, uint8_t *pix0,     \
                  uint8_t *pix1, uint8_t *pix2,     \
                  intptr_t i_stride, int scores[3] ) \
{                                                   \
    ALIGNED_16( int sum0 );                         \
    ALIGNED_16( int sum1 );                         \
    ALIGNED_16( int sum2 );                         \
                                                    \
    LOAD_ZERO;                                      \
    vec_u8_t fencv, pix0v, pix1v, pix2v;            \
    vec_s32_t sum0v, sum1v, sum2v;                  \
                                                    \
    sum0v = vec_splat_s32( 0 );                     \
    sum1v = vec_splat_s32( 0 );                     \
    sum2v = vec_splat_s32( 0 );                     \
                                                    \
    for( int y = 0; y < ly; y++ )                   \
    {                                               \
        PROCESS_PIXS                                \
    }                                               \
                                                    \
    sum0v = vec_sums( sum0v, zero_s32v );           \
    sum1v = vec_sums( sum1v, zero_s32v );           \
    sum2v = vec_sums( sum2v, zero_s32v );           \
                                                    \
    sum0v = vec_splat( sum0v, 3 );                  \
    sum1v = vec_splat( sum1v, 3 );                  \
    sum2v = vec_splat( sum2v, 3 );                  \
                                                    \
    vec_ste( sum0v, 0, &sum0 );                     \
    vec_ste( sum1v, 0, &sum1 );                     \
    vec_ste( sum2v, 0, &sum2 );                     \
                                                    \
    scores[0] = sum0;                               \
    scores[1] = sum1;                               \
    scores[2] = sum2;                               \
}

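/* ly is the block height divided by two, since PROCESS_PIXS consumes two rows
 * per iteration. */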
PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x8_altivec, 4 )
PIXEL_SAD_X3_ALTIVEC( pixel_sad_x3_8x16_altivec, 8 )

static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
                                       uint8_t *pix1, uint8_t *pix2,
                                       intptr_t i_stride, int scores[3] )
{
    ALIGNED_16( int sum0 );
    ALIGNED_16( int sum1 );
    ALIGNED_16( int sum2 );

    LOAD_ZERO;
    vec_u8_t fencv, pix0v, pix1v, pix2v;
    vec_s32_t sum0v, sum1v, sum2v;

    sum0v = vec_splat_s32(0);
    sum1v = vec_splat_s32(0);
    sum2v = vec_splat_s32(0);

    for( int y = 0; y < 4; y++ )
    {
        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;

        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );

        pix0v = vec_vsx_ld(0, pix0);
        pix0 += i_stride;

        pix1v = vec_vsx_ld(0, pix1);
        pix1 += i_stride;

        fencv = vec_ld(0, fenc);
        fenc += FENC_STRIDE;

        pix2v = vec_vsx_ld(0, pix2);
        pix2 += i_stride;

        sum0v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix0v ), (vec_u32_t) sum0v );
        sum1v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix1v ), (vec_u32_t) sum1v );
        sum2v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix2v ), (vec_u32_t) sum2v );
    }

    sum0v = vec_sums( sum0v, zero_s32v );
    sum1v = vec_sums( sum1v, zero_s32v );
    sum2v = vec_sums( sum2v, zero_s32v );

    sum0v = vec_splat( sum0v, 3 );
    sum1v = vec_splat( sum1v, 3 );
    sum2v = vec_splat( sum2v, 3 );

    vec_ste( sum0v, 0, &sum0 );
    vec_ste( sum1v, 0, &sum1 );
    vec_ste( sum2v, 0, &sum2 );

    scores[0] = sum0;
    scores[1] = sum1;
    scores[2] = sum2;
}

#define PIXEL_SAD_X4_ALTIVEC( name, ly )                                              \
static void name( uint8_t *fenc,                                                      \
                  uint8_t *pix0, uint8_t *pix1,                                       \
                  uint8_t *pix2, uint8_t *pix3,                                       \
                  intptr_t i_stride, int scores[4] )                                  \
{                                                                                     \
    LOAD_ZERO;                                                                        \
    vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;                                       \
    vec_s32_t sum0v, sum1v, sum2v, sum3v;                                             \
                                                                                      \
    sum0v = vec_splat_s32( 0 );                                                       \
    sum1v = vec_splat_s32( 0 );                                                       \
    sum2v = vec_splat_s32( 0 );                                                       \
    sum3v = vec_splat_s32( 0 );                                                       \
                                                                                      \
    for( int y = 0; y < ly; y++ )                                                     \
    {                                                                                 \
        PROCESS_PIXS                                                                  \
        vec_u8_t pix3vH = vec_vsx_ld( 0, pix3 );                                      \
        pix3 += i_stride;                                                             \
        vec_u8_t pix3vL = vec_vsx_ld( 0, pix3 );                                      \
        pix3 += i_stride;                                                             \
        pix3v = xxpermdi( pix3vH, pix3vL, 0 );                                        \
        sum3v = (vec_s32_t) vec_sum4s( vec_absd( fencv, pix3v ), (vec_u32_t) sum3v ); \
    }                                                                                 \
                                                                                      \
    sum0v = vec_sums( sum0v, zero_s32v );                                             \
    sum1v = vec_sums( sum1v, zero_s32v );                                             \
    sum2v = vec_sums( sum2v, zero_s32v );                                             \
    sum3v = vec_sums( sum3v, zero_s32v );                                             \
                                                                                      \
    vec_s32_t s01 = vec_mergel( sum0v, sum1v );                                       \
    vec_s32_t s23 = vec_mergel( sum2v, sum3v );                                       \
    vec_s32_t s = xxpermdi( s01, s23, 3 );                                            \
                                                                                      \
    vec_vsx_st( s, 0, scores );                                                       \
}

PIXEL_SAD_X4_ALTIVEC( pixel_sad_x4_8x8_altivec, 4 )
PIXEL_SAD_X4_ALTIVEC( pixel_sad_x4_8x16_altivec, 8 )

/***********************************************************************
 * SSD routines
 **********************************************************************/

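/* The 16x16 SSD below is software-pipelined: two rows (A and B) are in flight
 * at once, so each new load can issue while the previous row's difference is
 * squared and accumulated; the final row pair is handled after the loop. */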
static int pixel_ssd_16x16_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                    uint8_t *pix2, intptr_t i_stride_pix2 )
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t  pix1vA, pix2vA, pix1vB, pix2vB;
    vec_u32_t sumv;
    vec_u8_t diffA, diffB;

    sumv = vec_splat_u32(0);

    pix2vA = vec_vsx_ld(0, pix2);
    pix1vA = vec_ld(0, pix1);

    for( int y = 0; y < 7; y++ )
    {
        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        pix2vB = vec_vsx_ld(0, pix2);
        pix1vB = vec_ld(0, pix1);

        diffA = vec_absd(pix1vA, pix2vA);
        sumv = vec_msum(diffA, diffA, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;

        pix2vA = vec_vsx_ld(0, pix2);
        pix1vA = vec_ld(0, pix1);

        diffB = vec_absd(pix1vB, pix2vB);
        sumv = vec_msum(diffB, diffB, sumv);
    }

    pix1 += i_stride_pix1;
    pix2 += i_stride_pix2;

    pix2vB = vec_vsx_ld(0, pix2);
    pix1vB = vec_ld(0, pix1);

    diffA = vec_absd(pix1vA, pix2vA);
    sumv = vec_msum(diffA, diffA, sumv);

    diffB = vec_absd(pix1vB, pix2vB);
    sumv = vec_msum(diffB, diffB, sumv);

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);
    return sum;
}

static int pixel_ssd_8x8_altivec( uint8_t *pix1, intptr_t i_stride_pix1,
                                  uint8_t *pix2, intptr_t i_stride_pix2 )
{
    ALIGNED_16( int sum );

    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v;
    vec_u32_t sumv;
    vec_u8_t diffv;

    const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);

    sumv = vec_splat_u32(0);

    for( int y = 0; y < 8; y++ )
    {
        pix1v = vec_vsx_ld(0, pix1);
        pix2v = vec_vsx_ld(0, pix2);

        diffv = vec_absd( pix1v, pix2v );
        sumv = vec_msum(diffv, diffv, sumv);

        pix1 += i_stride_pix1;
        pix2 += i_stride_pix2;
    }

    sumv = vec_sel( zero_u32v, sumv, sel );

    sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
    sumv = vec_splat(sumv, 3);
    vec_ste((vec_s32_t) sumv, 0, &sum);

    return sum;
}


/****************************************************************************
 * variance
 ****************************************************************************/
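/* The var functions return the pixel sum in the low 32 bits and the sum of
 * squares in the high 32 bits; the caller derives the actual variance. */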
pixel_var_16x16_altivec(uint8_t * pix,intptr_t i_stride)1108 static uint64_t pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride )
1109 {
1110     ALIGNED_16(uint32_t sum_tab[4]);
1111     ALIGNED_16(uint32_t sqr_tab[4]);
1112 
1113     LOAD_ZERO;
1114     vec_u32_t sqr_v = zero_u32v;
1115     vec_u32_t sum_v = zero_u32v;
1116 
1117     for( int y = 0; y < 16; y++ )
1118     {
1119         vec_u8_t pix0_v = vec_ld(0, pix);
1120         sum_v = vec_sum4s(pix0_v, sum_v);
1121         sqr_v = vec_msum(pix0_v, pix0_v, sqr_v);
1122 
1123         pix += i_stride;
1124     }
1125     sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
1126     sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
1127     vec_ste(sum_v, 12, sum_tab);
1128     vec_ste(sqr_v, 12, sqr_tab);
1129 
1130     uint32_t sum = sum_tab[3];
1131     uint32_t sqr = sqr_tab[3];
1132     return sum + ((uint64_t)sqr<<32);
1133 }
1134 
pixel_var_8x8_altivec(uint8_t * pix,intptr_t i_stride)1135 static uint64_t pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride )
1136 {
1137     ALIGNED_16(uint32_t sum_tab[4]);
1138     ALIGNED_16(uint32_t sqr_tab[4]);
1139 
1140     LOAD_ZERO;
1141     vec_u32_t sqr_v = zero_u32v;
1142     vec_u32_t sum_v = zero_u32v;
1143 
1144     static const vec_u8_t perm_tab[] =
1145     {
1146         CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  /* pix=mod16, i_stride=mod16 */
1147            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
1148         CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  /* pix=mod8, i_stride=mod16  */
1149            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F),
1150     };
1151     vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ];
1152 
1153     for( int y = 0; y < 4; y++ )
1154     {
1155         vec_u8_t pix0_v = vec_ld(0, pix);
1156         vec_u8_t pix1_v = vec_ld(i_stride, pix);
1157         vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm);
1158         sum_v = vec_sum4s(pix_v, sum_v);
1159         sqr_v = vec_msum(pix_v, pix_v, sqr_v);
1160 
1161         pix += i_stride<<1;
1162     }
1163     sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
1164     sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
1165     vec_ste(sum_v, 12, sum_tab);
1166     vec_ste(sqr_v, 12, sqr_tab);
1167 
1168     uint32_t sum = sum_tab[3];
1169     uint32_t sqr = sqr_tab[3];
1170     return sum + ((uint64_t)sqr<<32);
1171 }
1172 
1173 
1174 /**********************************************************************
1175  * SA8D routines: sum of 8x8 Hadamard transformed differences
1176  **********************************************************************/
1177 /* SA8D_1D unrolled by 8 in Altivec */
1178 #define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
1179                          sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
1180 {                                                         \
1181     /* int    a0  =        SRC(0) + SRC(4) */             \
1182     vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
1183     /* int    a4  =        SRC(0) - SRC(4) */             \
1184     vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
1185     /* int    a1  =        SRC(1) + SRC(5) */             \
1186     vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
1187     /* int    a5  =        SRC(1) - SRC(5) */             \
1188     vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
1189     /* int    a2  =        SRC(2) + SRC(6) */             \
1190     vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
1191     /* int    a6  =        SRC(2) - SRC(6) */             \
1192     vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
1193     /* int    a3  =        SRC(3) + SRC(7) */             \
1194     vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
1195     /* int    a7  =        SRC(3) - SRC(7) */             \
1196     vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
1197                                                           \
1198     /* int    b0  =         a0 + a2  */                   \
1199     vec_s16_t b0v = vec_add(a0v, a2v);                    \
1200     /* int    b2  =         a0 - a2; */                   \
1201     vec_s16_t  b2v = vec_sub(a0v, a2v);                   \
1202     /* int    b1  =         a1 + a3; */                   \
1203     vec_s16_t b1v = vec_add(a1v, a3v);                    \
1204     /* int    b3  =         a1 - a3; */                   \
1205     vec_s16_t b3v = vec_sub(a1v, a3v);                    \
1206     /* int    b4  =         a4 + a6; */                   \
1207     vec_s16_t b4v = vec_add(a4v, a6v);                    \
1208     /* int    b6  =         a4 - a6; */                   \
1209     vec_s16_t b6v = vec_sub(a4v, a6v);                    \
1210     /* int    b5  =         a5 + a7; */                   \
1211     vec_s16_t b5v = vec_add(a5v, a7v);                    \
1212     /* int    b7  =         a5 - a7; */                   \
1213     vec_s16_t b7v = vec_sub(a5v, a7v);                    \
1214                                                           \
1215     /* DST(0,        b0 + b1) */                          \
1216     sa8d0v = vec_add(b0v, b1v);                           \
1217     /* DST(1,        b0 - b1) */                          \
1218     sa8d1v = vec_sub(b0v, b1v);                           \
1219     /* DST(2,        b2 + b3) */                          \
1220     sa8d2v = vec_add(b2v, b3v);                           \
1221     /* DST(3,        b2 - b3) */                          \
1222     sa8d3v = vec_sub(b2v, b3v);                           \
1223     /* DST(4,        b4 + b5) */                          \
1224     sa8d4v = vec_add(b4v, b5v);                           \
1225     /* DST(5,        b4 - b5) */                          \
1226     sa8d5v = vec_sub(b4v, b5v);                           \
1227     /* DST(6,        b6 + b7) */                          \
1228     sa8d6v = vec_add(b6v, b7v);                           \
1229     /* DST(7,        b6 - b7) */                          \
1230     sa8d7v = vec_sub(b6v, b7v);                           \
1231 }
1232 
static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd = 0;

    PREP_DIFF;

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}

static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 )+2)>>2;
    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                     uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) +2)>>2;
    return i_satd;
}

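/* One level of a 4-point Hadamard transform, applied lane-wise to four
 * vectors of eight 16-bit samples (used by the hadamard_ac routines below). */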
#define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\
    vec_s16_t t0 = vec_add(s0, s1);                 \
    vec_s16_t t1 = vec_sub(s0, s1);                 \
    vec_s16_t t2 = vec_add(s2, s3);                 \
    vec_s16_t t3 = vec_sub(s2, s3);                 \
    d0 = vec_add(t0, t2);                           \
    d2 = vec_sub(t0, t2);                           \
    d1 = vec_add(t1, t3);                           \
    d3 = vec_sub(t1, t3);                           \
}

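/* Widen 8 unsigned bytes to 8 signed 16-bit lanes by permuting the source
 * together with the zero vector; the operand order is swapped when not
 * big-endian so the permute tables below select the intended bytes on
 * little-endian targets as well. */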
#ifdef WORDS_BIGENDIAN
#define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(val, zero_u8v, perm)
#else
#define vec_perm_extend_s16(val, perm) (vec_s16_t)vec_perm(zero_u8v, val, perm)
#endif

#define VEC_LOAD_HIGH( p, num )                                    \
    vec_u8_t pix8_##num = vec_ld( stride*num, p );                 \
    vec_s16_t pix16_s##num = vec_perm_extend_s16( pix8_##num, perm ); \
    vec_s16_t pix16_d##num;

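/* Hadamard AC of one 8x8 block: sum4 accumulates the absolute coefficients of
 * the four 4x4 Hadamard transforms, sum8 those of the full 8x8 transform; the
 * DC term is then subtracted from both and the two sums are packed into one
 * uint64_t as (sum8 << 32) + sum4 for the size wrappers below to combine. */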
static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm )
{
    ALIGNED_16( int32_t sum4_tab[4] );
    ALIGNED_16( int32_t sum8_tab[4] );
    LOAD_ZERO;

    VEC_LOAD_HIGH( pix, 0 );
    VEC_LOAD_HIGH( pix, 1 );
    VEC_LOAD_HIGH( pix, 2 );
    VEC_LOAD_HIGH( pix, 3 );
    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    VEC_LOAD_HIGH( pix, 4 );
    VEC_LOAD_HIGH( pix, 5 );
    VEC_LOAD_HIGH( pix, 6 );
    VEC_LOAD_HIGH( pix, 7 );
    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7,
                    pix16_s0, pix16_s1, pix16_s2, pix16_s3,
                    pix16_s4, pix16_s5, pix16_s6, pix16_s7);

    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
    vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
    vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
    vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );

    vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);

    vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
    vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
    vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
    vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
    vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
    vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
    vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
    vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);

    int sum4 = sum4_tab[3];

    VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
                    tmpi4, tmpi5, tmpi6, tmpi7,
                    pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7);

    vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
    vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
    vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
    vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );

    vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);

    int sum8 = sum8_tab[3];

    ALIGNED_16( int16_t tmp0_4_tab[8] );
    vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);

    sum4 -= tmp0_4_tab[0];
    sum8 -= tmp0_4_tab[0];
    return ((uint64_t)sum8<<32) + sum4;
}

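/* Byte-permute patterns for vec_perm_extend_s16: the first entry widens the
 * first eight bytes of an aligned 16-byte load (pix 16-byte aligned), the
 * second the last eight bytes (pix only 8-byte aligned), so an 8x8 block can
 * be read from either half of an aligned row. */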
static const vec_u8_t hadamard_permtab[] =
{
    CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03,     /* pix = mod16 */
       0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ),
    CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B,     /* pix = mod8 */
       0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
};

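/* The per-8x8 results are added together while still packed; the final
 * ((sum>>34)<<32) + ((uint32_t)sum>>1) leaves sum8/4 in the high 32 bits and
 * sum4/2 in the low 32 bits, the same scaling the C reference uses. */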
static uint64_t pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}


/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
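/* For two adjacent 4x4 blocks (pix and pix+4), accumulate the four SSIM
 * partial sums per block: sum of pix1, sum of pix2, sum of squares of both,
 * and sum of pix1*pix2; the generic SSIM end code combines them later. */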
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
                                     const uint8_t *pix2, intptr_t stride2,
                                     int sums[2][4] )
{
    ALIGNED_16( int temp[4] );

    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    LOAD_ZERO;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( int y = 0; y < 4; y++ )
    {
        pix1v = vec_vsx_ld( y*stride1, pix1 );
        pix2v = vec_vsx_ld( y*stride2, pix2 );

        s1v = vec_sum4s( pix1v, s1v );
        s2v = vec_sum4s( pix2v, s2v );
        ssv = vec_msum( pix1v, pix1v, ssv );
        ssv = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}

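/* Generate satd_x3/satd_x4 helpers: score three or four candidate blocks
 * against the same fenc block by calling the single-block SATD routine. */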
#define SATD_X( size ) \
static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
                                            intptr_t i_stride, int scores[3] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
                                            uint8_t *pix3, intptr_t i_stride, int scores[4] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\
}
SATD_X( 16x16 )\
SATD_X( 16x8 )\
SATD_X( 8x16 )\
SATD_X( 8x8 )\
SATD_X( 8x4 )\
SATD_X( 4x8 )\
SATD_X( 4x4 )


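/* Intra analysis helpers: run the C predictors for the three candidate modes
 * into a temporary or fdec buffer and score each prediction against fenc with
 * the AltiVec mbcmp (SAD, SATD or SA8D). */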
#define INTRA_MBCMP_8x8( mbcmp )\
static void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
    ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
    x264_predict_8x8_v_c( pix, edge );\
    res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h_c( pix, edge );\
    res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc_c( pix, edge );\
    res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)

#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
static void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\
{\
    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
    res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
    res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
    res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )
#endif // !HIGH_BIT_DEPTH

/****************************************************************************
 * x264_pixel_init:
 ****************************************************************************/
void x264_pixel_init_altivec( x264_pixel_function_t *pixf )
{
#if !HIGH_BIT_DEPTH
    pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
    pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
    pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
    pixf->sad[PIXEL_8x8]    = pixel_sad_8x8_altivec;

    pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
    pixf->sad_x3[PIXEL_8x16]  = pixel_sad_x3_8x16_altivec;
    pixf->sad_x3[PIXEL_16x8]  = pixel_sad_x3_16x8_altivec;
    pixf->sad_x3[PIXEL_8x8]   = pixel_sad_x3_8x8_altivec;

    pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
    pixf->sad_x4[PIXEL_8x16]  = pixel_sad_x4_8x16_altivec;
    pixf->sad_x4[PIXEL_16x8]  = pixel_sad_x4_16x8_altivec;
    pixf->sad_x4[PIXEL_8x8]   = pixel_sad_x4_8x8_altivec;

    pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
    pixf->satd[PIXEL_8x16]  = pixel_satd_8x16_altivec;
    pixf->satd[PIXEL_16x8]  = pixel_satd_16x8_altivec;
    pixf->satd[PIXEL_8x8]   = pixel_satd_8x8_altivec;
    pixf->satd[PIXEL_8x4]   = pixel_satd_8x4_altivec;
    pixf->satd[PIXEL_4x8]   = pixel_satd_4x8_altivec;
    pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;

    pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
    pixf->satd_x3[PIXEL_8x16]  = pixel_satd_x3_8x16_altivec;
    pixf->satd_x3[PIXEL_16x8]  = pixel_satd_x3_16x8_altivec;
    pixf->satd_x3[PIXEL_8x8]   = pixel_satd_x3_8x8_altivec;
    pixf->satd_x3[PIXEL_8x4]   = pixel_satd_x3_8x4_altivec;
    pixf->satd_x3[PIXEL_4x8]   = pixel_satd_x3_4x8_altivec;
    pixf->satd_x3[PIXEL_4x4]   = pixel_satd_x3_4x4_altivec;

    pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
    pixf->satd_x4[PIXEL_8x16]  = pixel_satd_x4_8x16_altivec;
    pixf->satd_x4[PIXEL_16x8]  = pixel_satd_x4_16x8_altivec;
    pixf->satd_x4[PIXEL_8x8]   = pixel_satd_x4_8x8_altivec;
    pixf->satd_x4[PIXEL_8x4]   = pixel_satd_x4_8x4_altivec;
    pixf->satd_x4[PIXEL_4x8]   = pixel_satd_x4_4x8_altivec;
    pixf->satd_x4[PIXEL_4x4]   = pixel_satd_x4_4x4_altivec;

    pixf->intra_sad_x3_8x8    = intra_sad_x3_8x8_altivec;
    pixf->intra_sad_x3_8x8c   = intra_sad_x3_8x8c_altivec;
    pixf->intra_sad_x3_16x16  = intra_sad_x3_16x16_altivec;

    pixf->intra_satd_x3_4x4   = intra_satd_x3_4x4_altivec;
    pixf->intra_satd_x3_8x8c  = intra_satd_x3_8x8c_altivec;
    pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;

    pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
    pixf->ssd[PIXEL_8x8]   = pixel_ssd_8x8_altivec;

    pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;

    pixf->intra_sa8d_x3_8x8   = intra_sa8d_x3_8x8_altivec;

    pixf->var[PIXEL_16x16] = pixel_var_16x16_altivec;
    pixf->var[PIXEL_8x8]   = pixel_var_8x8_altivec;

    pixf->hadamard_ac[PIXEL_16x16] = pixel_hadamard_ac_16x16_altivec;
    pixf->hadamard_ac[PIXEL_16x8]  = pixel_hadamard_ac_16x8_altivec;
    pixf->hadamard_ac[PIXEL_8x16]  = pixel_hadamard_ac_8x16_altivec;
    pixf->hadamard_ac[PIXEL_8x8]   = pixel_hadamard_ac_8x8_altivec;

    pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
#endif // !HIGH_BIT_DEPTH
}