1 /*****************************************************************************
2  * Copyright (C) 2013-2020 MulticoreWare, Inc
3  *
4  * Authors: Steve Borho <steve@borho.org>
5  *          Mandar Gurav <mandar@multicorewareinc.com>
6  *          Mahesh Pittala <mahesh@multicorewareinc.com>
7  *          Min Chen <min.chen@multicorewareinc.com>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at license @ x265.com.
25  *****************************************************************************/
26 
27 #include "common.h"
28 #include "primitives.h"
29 #include "x265.h"
30 #include "ppccommon.h"
31 
32 #include <cstdlib> // abs()
33 
34 //using namespace X265_NS;
35 
36 namespace X265_NS {
37 // place functions in anonymous namespace (file static)
38 
39  /* Null vector */
40 #define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )
41 
42 #define zero_u8v  (vec_u8_t)  zerov
43 #define zero_s8v  (vec_s8_t)  zerov
44 #define zero_u16v (vec_u16_t) zerov
45 #define zero_s16v (vec_s16_t) zerov
46 #define zero_u32v (vec_u32_t) zerov
47 #define zero_s32v (vec_s32_t) zerov
48 
49  /* 8 <-> 16 bits conversions */
50 #ifdef WORDS_BIGENDIAN
51 #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
52 #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
53 #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
54 #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
55 #else
56 #define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
57 #define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
58 #define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
59 #define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
60 #endif
61 
62 #define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
63 #define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
64 
65 #if defined(__GNUC__)
66 #define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
67 #define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
68 #define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
69 #elif defined(_MSC_VER)
70 #define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
71 #define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
72 #define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
73 #endif // if defined(__GNUC__)
74 
75 typedef uint8_t  pixel;
76 typedef uint32_t sum2_t ;
77 typedef uint16_t sum_t ;
78 #define BITS_PER_SUM (8 * sizeof(sum_t))
79 
80 /***********************************************************************
81  * SAD routines - altivec implementation
82  **********************************************************************/
83 template<int lx, int ly>
sum_columns_altivec(vec_s32_t sumv,int * sum)84 void inline sum_columns_altivec(vec_s32_t sumv, int* sum){}
85 
86 template<int lx, int ly>
sad16_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)87 int inline sad16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
88 {
89     assert(lx <=16);
90     LOAD_ZERO;
91     vec_u8_t  pix1v, pix2v;
92     vec_u8_t  absv = zero_u8v;
93     vec_s32_t sumv = zero_s32v;
94     ALIGN_VAR_16(int, sum );
95 
96     for( int y = 0; y < ly; y++ )
97     {
98         pix1v = /*vec_vsx_ld*/vec_xl( 0, pix1);
99         pix2v = /*vec_vsx_ld*/vec_xl( 0, pix2);
100         //print_vec_u8("pix1v", &pix1v);
101         //print_vec_u8("pix2v", &pix2v);
102 
103         absv = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
104         //print_vec_u8("abs sub", &absv);
105 
106         sumv = (vec_s32_t) vec_sum4s( absv, (vec_u32_t) sumv);
107         //print_vec_i("vec_sum4s 0", &sumv);
108 
109         pix1 += stride_pix1;
110         pix2 += stride_pix2;
111     }
112 
113     sum_columns_altivec<lx, ly>(sumv, &sum);
114     //printf("<%d %d>%d\n", lx, ly, sum);
115     return sum;
116 }
117 
118 template<int lx, int ly> //to be implemented later
sad16_altivec(const int16_t * pix1,intptr_t stride_pix1,const int16_t * pix2,intptr_t stride_pix2)119 int sad16_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
120 {
121     int sum = 0;
122     return sum;
123 }
124 
125 template<int lx, int ly>//to be implemented later
sad_altivec(const int16_t * pix1,intptr_t stride_pix1,const int16_t * pix2,intptr_t stride_pix2)126 int sad_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
127 {
128     int sum = 0;
129     return sum;
130 }
131 
132 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)133 void inline sum_columns_altivec<16, 4>(vec_s32_t sumv, int* sum)
134 {
135     LOAD_ZERO;
136     sumv = vec_sums( sumv, zero_s32v );
137     //print_vec_i("vec_sums", &sumv);
138     sumv = vec_splat( sumv, 3 );
139     //print_vec_i("vec_splat 3", &sumv);
140     vec_ste( sumv, 0, sum );
141 }
142 
143 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)144 void inline sum_columns_altivec<16, 8>(vec_s32_t sumv, int* sum)
145 {
146     LOAD_ZERO;
147     sumv = vec_sums( sumv, zero_s32v );
148     //print_vec_i("vec_sums", &sumv);
149     sumv = vec_splat( sumv, 3 );
150     //print_vec_i("vec_splat 3", &sumv);
151     vec_ste( sumv, 0, sum );
152 }
153 
154 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)155 void inline sum_columns_altivec<16, 12>(vec_s32_t sumv, int* sum)
156 {
157     LOAD_ZERO;
158     sumv = vec_sums( sumv, zero_s32v );
159     //print_vec_i("vec_sums", &sumv);
160     sumv = vec_splat( sumv, 3 );
161     //print_vec_i("vec_splat 3", &sumv);
162     vec_ste( sumv, 0, sum );
163 }
164 
165 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)166 void inline sum_columns_altivec<16, 16>(vec_s32_t sumv, int* sum)
167 {
168     LOAD_ZERO;
169     sumv = vec_sums( sumv, zero_s32v );
170     //print_vec_i("vec_sums", &sumv);
171     sumv = vec_splat( sumv, 3 );
172     //print_vec_i("vec_splat 3", &sumv);
173     vec_ste( sumv, 0, sum );
174 }
175 
176 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)177 void inline sum_columns_altivec<16, 24>(vec_s32_t sumv, int* sum)
178 {
179     LOAD_ZERO;
180     sumv = vec_sums( sumv, zero_s32v );
181     //print_vec_i("vec_sums", &sumv);
182     sumv = vec_splat( sumv, 3 );
183     //print_vec_i("vec_splat 3", &sumv);
184     vec_ste( sumv, 0, sum );
185 }
186 
187 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)188 void inline sum_columns_altivec<16, 32>(vec_s32_t sumv, int* sum)
189 {
190     LOAD_ZERO;
191     sumv = vec_sums( sumv, zero_s32v );
192     //print_vec_i("vec_sums", &sumv);
193     sumv = vec_splat( sumv, 3 );
194     //print_vec_i("vec_splat 3", &sumv);
195     vec_ste( sumv, 0, sum );
196 }
197 
198 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)199 void inline sum_columns_altivec<16, 48>(vec_s32_t sumv, int* sum)
200 {
201     LOAD_ZERO;
202     sumv = vec_sums( sumv, zero_s32v );
203     //print_vec_i("vec_sums", &sumv);
204     sumv = vec_splat( sumv, 3 );
205     //print_vec_i("vec_splat 3", &sumv);
206     vec_ste( sumv, 0, sum );
207 }
208 
209 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)210 void inline sum_columns_altivec<16, 64>(vec_s32_t sumv, int* sum)
211 {
212     LOAD_ZERO;
213     sumv = vec_sums( sumv, zero_s32v );
214     //print_vec_i("vec_sums", &sumv);
215     sumv = vec_splat( sumv, 3 );
216     //print_vec_i("vec_splat 3", &sumv);
217     vec_ste( sumv, 0, sum );
218 }
219 
220 
221 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)222 void inline sum_columns_altivec<8, 4>(vec_s32_t sumv, int* sum)
223 {
224     LOAD_ZERO;
225     sumv = vec_sum2s( sumv, zero_s32v );
226     //print_vec_i("vec_sums", &sumv);
227     sumv = vec_splat( sumv, 1 );
228     //print_vec_i("vec_splat 1", &sumv);
229     vec_ste( sumv, 0, sum );
230 }
231 
232 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)233 void inline sum_columns_altivec<8, 8>(vec_s32_t sumv, int* sum)
234 {
235     LOAD_ZERO;
236     sumv = vec_sum2s( sumv, zero_s32v );
237     //print_vec_i("vec_sums", &sumv);
238     sumv = vec_splat( sumv, 1 );
239     //print_vec_i("vec_splat 1", &sumv);
240     vec_ste( sumv, 0, sum );
241 }
242 
243 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)244 void inline sum_columns_altivec<8, 16>(vec_s32_t sumv, int* sum)
245 {
246     LOAD_ZERO;
247     sumv = vec_sum2s( sumv, zero_s32v );
248     //print_vec_i("vec_sums", &sumv);
249     sumv = vec_splat( sumv, 1 );
250     //print_vec_i("vec_splat 1", &sumv);
251     vec_ste( sumv, 0, sum );
252 }
253 
254 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)255 void inline sum_columns_altivec<8, 32>(vec_s32_t sumv, int* sum)
256 {
257     LOAD_ZERO;
258     sumv = vec_sum2s( sumv, zero_s32v );
259     //print_vec_i("vec_sums", &sumv);
260     sumv = vec_splat( sumv, 1 );
261     //print_vec_i("vec_splat 1", &sumv);
262     vec_ste( sumv, 0, sum );
263 }
264 
265 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)266 void inline sum_columns_altivec<4, 4>(vec_s32_t sumv, int* sum)
267 {
268     LOAD_ZERO;
269     sumv = vec_splat( sumv, 0 );
270     //print_vec_i("vec_splat 0", &sumv);
271     vec_ste( sumv, 0, sum );
272 }
273 
274 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)275 void inline sum_columns_altivec<4, 8>(vec_s32_t sumv, int* sum)
276 {
277     LOAD_ZERO;
278     sumv = vec_splat( sumv, 0 );
279     //print_vec_i("vec_splat 0", &sumv);
280     vec_ste( sumv, 0, sum );
281 }
282 
283 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)284 void inline sum_columns_altivec<4, 16>(vec_s32_t sumv, int* sum)
285 {
286     LOAD_ZERO;
287     sumv = vec_splat( sumv, 0 );
288     //print_vec_i("vec_splat 0", &sumv);
289     vec_ste( sumv, 0, sum );
290 }
291 
292 template<>
sum_columns_altivec(vec_s32_t sumv,int * sum)293 void inline sum_columns_altivec<12, 16>(vec_s32_t sumv, int* sum)
294 {
295     LOAD_ZERO;
296     vec_s32_t sum1v= vec_splat( sumv, 3);
297     sumv = vec_sums( sumv, zero_s32v );
298     //print_vec_i("vec_sums", &sumv);
299     sumv = vec_splat( sumv, 3 );
300     //print_vec_i("vec_splat 1", &sumv);
301     sumv = vec_sub(sumv, sum1v);
302     vec_ste( sumv, 0, sum );
303 }
304 
305 template<int lx, int ly>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)306 int inline sad_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2){ return 0; }
307 
308 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)309 int inline sad_altivec<24, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
310 {
311     ALIGN_VAR_16(int, sum );
312     sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
313               + sad16_altivec<8, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2);
314     //printf("<24 32>%d\n", sum);
315     return sum;
316 }
317 
318 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)319 int inline sad_altivec<32, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
320 {
321     ALIGN_VAR_16(int, sum );
322     sum = sad16_altivec<16, 8>(pix1, stride_pix1, pix2, stride_pix2)
323               + sad16_altivec<16, 8>(pix1+16, stride_pix1, pix2+16, stride_pix2);
324    //printf("<32 8>%d\n", sum);
325    return sum;
326 }
327 
328 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)329 int inline sad_altivec<32, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
330 {
331     ALIGN_VAR_16(int, sum );
332     sum = sad16_altivec<16, 16>(pix1, stride_pix1, pix2, stride_pix2)
333               + sad16_altivec<16, 16>(pix1+16, stride_pix1, pix2+16, stride_pix2);
334     //printf("<32 16>%d\n", sum);
335     return sum;
336 }
337 
338 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)339 int inline sad_altivec<32, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
340 {
341     ALIGN_VAR_16(int, sum );
342     sum = sad16_altivec<16, 24>(pix1, stride_pix1, pix2, stride_pix2)
343               + sad16_altivec<16, 24>(pix1+16, stride_pix1, pix2+16, stride_pix2);
344     //printf("<32 24>%d\n", sum);
345     return sum;
346 }
347 
348 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)349 int inline sad_altivec<32, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
350 {
351     ALIGN_VAR_16(int, sum );
352     sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
353               + sad16_altivec<16, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2);
354     //printf("<32 32>%d\n", sum);
355     return sum;
356 }
357 
358 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)359 int inline sad_altivec<32, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
360 {
361     ALIGN_VAR_16(int, sum );
362     sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
363               + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2);
364     //printf("<32 64>%d\n", sum);
365     return sum;
366 }
367 
368 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)369 int inline sad_altivec<48, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
370 {
371     ALIGN_VAR_16(int, sum );
372     sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
373               + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2)
374               + sad16_altivec<16, 64>(pix1+32, stride_pix1, pix2+32, stride_pix2);
375     //printf("<48 64>%d\n", sum);
376     return sum;
377 }
378 
379 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)380 int inline sad_altivec<64, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
381 {
382     ALIGN_VAR_16(int, sum );
383     sum = sad16_altivec<16, 16>(pix1, stride_pix1, pix2, stride_pix2)
384               + sad16_altivec<16, 16>(pix1+16, stride_pix1, pix2+16, stride_pix2)
385               + sad16_altivec<16, 16>(pix1+32, stride_pix1, pix2+32, stride_pix2)
386               + sad16_altivec<16, 16>(pix1+48, stride_pix1, pix2+48, stride_pix2);
387     //printf("<64 16>%d\n", sum);
388     return sum;
389 }
390 
391 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)392 int inline sad_altivec<64, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
393 {
394     ALIGN_VAR_16(int, sum );
395     sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
396               + sad16_altivec<16, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2)
397               + sad16_altivec<16, 32>(pix1+32, stride_pix1, pix2+32, stride_pix2)
398               + sad16_altivec<16, 32>(pix1+48, stride_pix1, pix2+48, stride_pix2);
399     //printf("<64 32>%d\n", sum);
400     return sum;
401 }
402 
403 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)404 int inline sad_altivec<64, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
405 {
406     ALIGN_VAR_16(int, sum );
407     sum = sad16_altivec<16, 48>(pix1, stride_pix1, pix2, stride_pix2)
408               + sad16_altivec<16, 48>(pix1+16, stride_pix1, pix2+16, stride_pix2)
409               + sad16_altivec<16, 48>(pix1+32, stride_pix1, pix2+32, stride_pix2)
410               + sad16_altivec<16, 48>(pix1+48, stride_pix1, pix2+48, stride_pix2);
411     //printf("<64 48>%d\n", sum);
412     return sum;
413 }
414 
415 template<>
sad_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)416 int inline sad_altivec<64, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
417 {
418     ALIGN_VAR_16(int, sum );
419     sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
420               + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2)
421               + sad16_altivec<16, 64>(pix1+32, stride_pix1, pix2+32, stride_pix2)
422               + sad16_altivec<16, 64>(pix1+48, stride_pix1, pix2+48, stride_pix2);
423     //printf("<64 64>%d\n", sum);
424     return sum;
425 }
426 
427 /***********************************************************************
428  * SAD_X3 routines - altivec implementation
429  **********************************************************************/
430 template<int lx, int ly>
sad16_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)431 void inline sad16_x3_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
432 {
433     res[0] = 0;
434     res[1] = 0;
435     res[2] = 0;
436     assert(lx <=16);
437     LOAD_ZERO;
438     vec_u8_t  pix1v, pix2v, pix3v, pix4v;
439     vec_u8_t  absv1_2 = zero_u8v;
440     vec_u8_t  absv1_3 = zero_u8v;
441     vec_u8_t  absv1_4 = zero_u8v;
442     vec_s32_t sumv0 = zero_s32v;
443     vec_s32_t sumv1 = zero_s32v;
444     vec_s32_t sumv2 = zero_s32v;
445 
446     for( int y = 0; y < ly; y++ )
447     {
448         pix1v = vec_xl( 0, pix1); //@@RM vec_vsx_ld( 0, pix1);
449         pix2v = vec_xl( 0, pix2); //@@RM vec_vsx_ld( 0, pix2);
450         pix3v = vec_xl( 0, pix3); //@@RM vec_vsx_ld( 0, pix3);
451         pix4v = vec_xl( 0, pix4); //@@RM vec_vsx_ld( 0, pix4);
452 
453         //@@RM : using vec_abs has 2 drawbacks here:
454         //@@RM first, it produces the incorrect result (unpack should be used first)
455         //@@RM second, it is slower than sub(max,min), as noted in freescale's documentation
456         //@@RM absv = (vector unsigned char)vec_abs((vector signed char)vec_sub(pix1v, pix2v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix2v));
457         absv1_2 = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix2v));
458         sumv0 = (vec_s32_t) vec_sum4s( absv1_2, (vec_u32_t) sumv0);
459 
460         absv1_3 = (vector unsigned char)vec_sub(vec_max(pix1v, pix3v), vec_min(pix1v, pix3v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix3v));
461         sumv1 = (vec_s32_t) vec_sum4s( absv1_3, (vec_u32_t) sumv1);
462 
463         absv1_4 = (vector unsigned char)vec_sub(vec_max(pix1v, pix4v), vec_min(pix1v, pix4v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix3v));
464         sumv2 = (vec_s32_t) vec_sum4s( absv1_4, (vec_u32_t) sumv2);
465 
466         pix1 += FENC_STRIDE;
467         pix2 += frefstride;
468         pix3 += frefstride;
469         pix4 += frefstride;
470     }
471 
472     sum_columns_altivec<lx, ly>(sumv0, res+0);
473     sum_columns_altivec<lx, ly>(sumv1, res+1);
474     sum_columns_altivec<lx, ly>(sumv2, res+2);
475     //printf("<%d %d>%d %d %d\n", lx, ly, res[0], res[1], res[2]);
476 }
477 
478 template<int lx, int ly>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)479 void inline sad_x3_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res){}
480 
481 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)482 void inline sad_x3_altivec<24, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
483 {
484     int32_t sum[3];
485     sad16_x3_altivec<16, 32>(pix1, pix2, pix3, pix4, frefstride, sum);
486     sad16_x3_altivec<8, 32>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
487     res[0] += sum[0];
488     res[1] += sum[1];
489     res[2] += sum[2];
490     //printf("<24 32>%d %d %d\n", res[0], res[1], res[2]);
491 }
492 
493 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)494 void inline sad_x3_altivec<32, 8>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
495 {
496     int32_t sum[3];
497     sad16_x3_altivec<16, 8>(pix1, pix2, pix3, pix4, frefstride, sum);
498     sad16_x3_altivec<16, 8>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
499     res[0] += sum[0];
500     res[1] += sum[1];
501     res[2] += sum[2];
502     //printf("<32 8>%d %d %d\n", res[0], res[1], res[2]);
503 }
504 
505 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)506 void inline sad_x3_altivec<32, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
507 {
508     int32_t sum[3];
509     sad16_x3_altivec<16, 16>(pix1, pix2, pix3, pix4, frefstride, sum);
510     sad16_x3_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
511     res[0] += sum[0];
512     res[1] += sum[1];
513     res[2] += sum[2];
514     //printf("<32 16>%d %d %d\n", res[0], res[1], res[2]);
515 }
516 
517 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)518 void inline sad_x3_altivec<32, 24>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
519 {
520     int32_t sum[3];
521     sad16_x3_altivec<16, 24>(pix1, pix2, pix3, pix4, frefstride, sum);
522     sad16_x3_altivec<16, 24>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
523     res[0] += sum[0];
524     res[1] += sum[1];
525     res[2] += sum[2];
526     //printf("<32 24>%d %d %d\n", res[0], res[1], res[2]);
527 }
528 
529 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)530 void sad_x3_altivec<32, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
531 {
532 
533     const int lx = 32 ;
534     const int ly = 32 ;
535 
536     vector unsigned int v_zeros = {0, 0, 0, 0} ;
537 
538     vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0} ;
539     vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0} ;
540     vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0} ;
541 
542 
543     vector signed int v_results_int_0 ;
544     vector signed int v_results_int_1 ;
545     vector signed int v_results_int_2 ;
546 
547     vector unsigned char v_pix1 ;
548     vector unsigned char v_pix2 ;
549     vector unsigned char v_pix3 ;
550     vector unsigned char v_pix4 ;
551 
552     vector unsigned char v_abs_diff_0 ;
553     vector unsigned char v_abs_diff_1 ;
554     vector unsigned char v_abs_diff_2 ;
555 
556     vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
557 
558     vector signed short v_short_0_0 , v_short_0_1 ;
559     vector signed short v_short_1_0 , v_short_1_1 ;
560     vector signed short v_short_2_0 , v_short_2_1 ;
561 
562     vector signed short v_sum_0 ;
563     vector signed short v_sum_1 ;
564     vector signed short v_sum_2 ;
565 
566 
567 
568     res[0] = 0;
569     res[1] = 0;
570     res[2] = 0;
571     for (int y = 0; y < ly; y++)
572     {
573         for (int x = 0; x < lx; x+=16)
574         {
575             v_pix1 = vec_xl(x, pix1) ;
576 
577             // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
578             v_pix2 = vec_xl(x, pix2) ;
579             v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2)) ;
580             v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0) ;
581             v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask) ;
582             v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0) ;
583             v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask) ;
584             v_sum_0 = vec_add(v_short_0_0, v_short_0_1) ;
585             v_results_0 = vec_add(v_results_0, v_sum_0) ;
586 
587             // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
588             v_pix3 = vec_xl(x, pix3) ;
589             v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3)) ;
590             v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1) ;
591             v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask) ;
592             v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1) ;
593             v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask) ;
594             v_sum_1 = vec_add(v_short_1_0, v_short_1_1) ;
595             v_results_1 = vec_add(v_results_1, v_sum_1) ;
596 
597 
598             // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
599             v_pix4 = vec_xl(x, pix4) ;
600             v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4)) ;
601             v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2) ;
602             v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask) ;
603             v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2) ;
604             v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask) ;
605             v_sum_2 = vec_add(v_short_2_0, v_short_2_1) ;
606             v_results_2 = vec_add(v_results_2, v_sum_2) ;
607 
608         }
609 
610         pix1 += FENC_STRIDE;
611         pix2 += frefstride;
612         pix3 += frefstride;
613         pix4 += frefstride;
614     }
615 
616 
617     v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros) ;
618     v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros) ;
619     res[0] = v_results_int_0[3] ;
620 
621 
622     v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros) ;
623     v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros) ;
624     res[1] = v_results_int_1[3] ;
625 
626 
627     v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros) ;
628     v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros) ;
629     res[2] = v_results_int_2[3] ;
630 
631     //printf("<32 32>%d %d %d\n", res[0], res[1], res[2]);
632 
633 } // end sad_x3_altivec
634 
635 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)636 void inline sad_x3_altivec<32, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
637 {
638     int32_t sum[3];
639     sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
640     sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
641     res[0] += sum[0];
642     res[1] += sum[1];
643     res[2] += sum[2];
644     //printf("<32 64>%d %d %d\n", res[0], res[1], res[2]);
645 }
646 
647 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)648 void inline sad_x3_altivec<48, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
649 {
650     int32_t sum[6];
651     sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
652     sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
653     sad16_x3_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, res);
654     res[0] = sum[0]+sum[3]+res[0];
655     res[1] = sum[1]+sum[4]+res[1];
656     res[2] = sum[2]+sum[5]+res[2];
657     //printf("<48 64>%d %d %d\n", res[0], res[1], res[2]);
658 }
659 
660 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)661 void inline sad_x3_altivec<64, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
662 {
663     int32_t sum[9];
664     sad16_x3_altivec<16, 16>(pix1, pix2, pix3, pix4, frefstride, sum);
665     sad16_x3_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
666     sad16_x3_altivec<16, 16>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
667     sad16_x3_altivec<16, 16>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
668     res[0] = sum[0]+sum[3]+sum[6]+res[0];
669     res[1] = sum[1]+sum[4]+sum[7]+res[1];
670     res[2] = sum[2]+sum[5]+sum[8]+res[2];
671     //printf("<64 16>%d %d %d\n", res[0], res[1], res[2]);
672 }
673 
674 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)675 void inline sad_x3_altivec<64, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
676 {
677     int32_t sum[9];
678     sad16_x3_altivec<16, 32>(pix1, pix2, pix3, pix4, frefstride, sum);
679     sad16_x3_altivec<16, 32>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
680     sad16_x3_altivec<16, 32>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
681     sad16_x3_altivec<16, 32>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
682     res[0] = sum[0]+sum[3]+sum[6]+res[0];
683     res[1] = sum[1]+sum[4]+sum[7]+res[1];
684     res[2] = sum[2]+sum[5]+sum[8]+res[2];
685     //printf("<64 32>%d %d %d\n", res[0], res[1], res[2]);
686 }
687 
688 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)689 void inline sad_x3_altivec<64, 48>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
690 {
691     int32_t sum[9];
692     sad16_x3_altivec<16, 48>(pix1, pix2, pix3, pix4, frefstride, sum);
693     sad16_x3_altivec<16, 48>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
694     sad16_x3_altivec<16, 48>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
695     sad16_x3_altivec<16, 48>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
696     res[0] = sum[0]+sum[3]+sum[6]+res[0];
697     res[1] = sum[1]+sum[4]+sum[7]+res[1];
698     res[2] = sum[2]+sum[5]+sum[8]+res[2];
699     //printf("<64 48>%d %d %d\n", res[0], res[1], res[2]);
700 }
701 
702 template<>
sad_x3_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,intptr_t frefstride,int32_t * res)703 void inline sad_x3_altivec<64, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
704 {
705     int32_t sum[9];
706     sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
707     sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
708     sad16_x3_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
709     sad16_x3_altivec<16, 64>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
710     res[0] = sum[0]+sum[3]+sum[6]+res[0];
711     res[1] = sum[1]+sum[4]+sum[7]+res[1];
712     res[2] = sum[2]+sum[5]+sum[8]+res[2];
713     //printf("<64 64>%d %d %d\n", res[0], res[1], res[2]);
714 }
715 
716 /***********************************************************************
717  * SAD_X4 routines - altivec implementation
718  **********************************************************************/
719 template<int lx, int ly>
sad16_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)720 void inline sad16_x4_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
721 {
722     res[0] = 0;
723     res[1] = 0;
724     res[2] = 0;
725     assert(lx <=16);
726     LOAD_ZERO;
727     vec_u8_t  pix1v, pix2v, pix3v, pix4v, pix5v;
728     vec_u8_t  absv1_2 = zero_u8v;
729     vec_u8_t  absv1_3 = zero_u8v;
730     vec_u8_t  absv1_4 = zero_u8v;
731     vec_u8_t  absv1_5 = zero_u8v;
732     vec_s32_t sumv0 = zero_s32v;
733     vec_s32_t sumv1 = zero_s32v;
734     vec_s32_t sumv2 = zero_s32v;
735     vec_s32_t sumv3 = zero_s32v;
736 
737     for( int y = 0; y < ly; y++ )
738     {
739         pix1v = vec_xl( 0, pix1); //@@RM vec_vsx_ld( 0, pix1);
740         pix2v = vec_xl( 0, pix2); //@@RM vec_vsx_ld( 0, pix2);
741         pix3v = vec_xl( 0, pix3); //@@RM vec_vsx_ld( 0, pix3);
742         pix4v = vec_xl( 0, pix4); //@@RM vec_vsx_ld( 0, pix4);
743         pix5v = vec_xl( 0, pix5); //@@RM vec_vsx_ld( 0, pix4);
744 
745         //@@RM : using vec_abs has 2 drawbacks here:
746         //@@RM first, it produces the incorrect result (unpack should be used first)
747         //@@RM second, it is slower than sub(max,min), as noted in freescale's documentation
748         //@@RM absv = (vector unsigned char)vec_abs((vector signed char)vec_sub(pix1v, pix2v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix2v));
749         absv1_2 = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix2v));
750         sumv0 = (vec_s32_t) vec_sum4s( absv1_2, (vec_u32_t) sumv0);
751 
752         absv1_3 = (vector unsigned char)vec_sub(vec_max(pix1v, pix3v), vec_min(pix1v, pix3v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix3v));
753         sumv1 = (vec_s32_t) vec_sum4s( absv1_3, (vec_u32_t) sumv1);
754 
755         absv1_4 = (vector unsigned char)vec_sub(vec_max(pix1v, pix4v), vec_min(pix1v, pix4v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix3v));
756         sumv2 = (vec_s32_t) vec_sum4s( absv1_4, (vec_u32_t) sumv2);
757 
758         absv1_5 = (vector unsigned char)vec_sub(vec_max(pix1v, pix5v), vec_min(pix1v, pix5v)); //@@RM vec_abs((vec_s8_t)vec_sub(pix1v, pix3v));
759         sumv3 = (vec_s32_t) vec_sum4s( absv1_5, (vec_u32_t) sumv3);
760 
761         pix1 += FENC_STRIDE;
762         pix2 += frefstride;
763         pix3 += frefstride;
764         pix4 += frefstride;
765         pix5 += frefstride;
766     }
767 
768     sum_columns_altivec<lx, ly>(sumv0, res+0);
769     sum_columns_altivec<lx, ly>(sumv1, res+1);
770     sum_columns_altivec<lx, ly>(sumv2, res+2);
771     sum_columns_altivec<lx, ly>(sumv3, res+3);
772     //printf("<%d %d>%d %d %d %d\n", lx, ly, res[0], res[1], res[2], res[3]);
773 }
774 
775 template<int lx, int ly>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)776 void inline sad_x4_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res){}
777 
778 
779 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)780 void inline sad_x4_altivec<24, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
781 {
782     int32_t sum[4];
783     sad16_x4_altivec<16, 32>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
784     sad16_x4_altivec<8, 32>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
785     res[0] += sum[0];
786     res[1] += sum[1];
787     res[2] += sum[2];
788     res[3] += sum[3];
789     //printf("<24 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
790 }
791 
792 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)793 void inline sad_x4_altivec<32, 8>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
794 {
795     int32_t sum[4];
796     sad16_x4_altivec<16, 8>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
797     sad16_x4_altivec<16, 8>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
798     res[0] += sum[0];
799     res[1] += sum[1];
800     res[2] += sum[2];
801     res[3] += sum[3];
802     //printf("<32 8>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
803 }
804 
805 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)806 void sad_x4_altivec<32,16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
807 {
808 
809     const int lx = 32 ;
810     const int ly = 16 ;
811 
812     vector unsigned int v_zeros = {0, 0, 0, 0} ;
813 
814     vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0} ;
815     vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0} ;
816     vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0} ;
817     vector signed short v_results_3 = {0, 0, 0, 0, 0, 0, 0, 0} ;
818 
819 
820     vector signed int v_results_int_0 ;
821     vector signed int v_results_int_1 ;
822     vector signed int v_results_int_2 ;
823     vector signed int v_results_int_3 ;
824 
825     vector unsigned char v_pix1 ;
826     vector unsigned char v_pix2 ;
827     vector unsigned char v_pix3 ;
828     vector unsigned char v_pix4 ;
829     vector unsigned char v_pix5 ;
830 
831     vector unsigned char v_abs_diff_0 ;
832     vector unsigned char v_abs_diff_1 ;
833     vector unsigned char v_abs_diff_2 ;
834     vector unsigned char v_abs_diff_3 ;
835 
836     vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
837 
838     vector signed short v_short_0_0 , v_short_0_1 ;
839     vector signed short v_short_1_0 , v_short_1_1 ;
840     vector signed short v_short_2_0 , v_short_2_1 ;
841     vector signed short v_short_3_0 , v_short_3_1 ;
842 
843     vector signed short v_sum_0 ;
844     vector signed short v_sum_1 ;
845     vector signed short v_sum_2 ;
846     vector signed short v_sum_3 ;
847 
848 
849     res[0] = 0;
850     res[1] = 0;
851     res[2] = 0;
852     res[3] = 0;
853     for (int y = 0; y < ly; y++)
854     {
855         for (int x = 0; x < lx; x+=16)
856         {
857             v_pix1 = vec_xl(x, pix1) ;
858 
859             // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
860             v_pix2 = vec_xl(x, pix2) ;
861             v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2)) ;
862             v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0) ;
863             v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask) ;
864             v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0) ;
865             v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask) ;
866             v_sum_0 = vec_add(v_short_0_0, v_short_0_1) ;
867             v_results_0 = vec_add(v_results_0, v_sum_0) ;
868 
869             // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
870             v_pix3 = vec_xl(x, pix3) ;
871             v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3)) ;
872             v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1) ;
873             v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask) ;
874             v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1) ;
875             v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask) ;
876             v_sum_1 = vec_add(v_short_1_0, v_short_1_1) ;
877             v_results_1 = vec_add(v_results_1, v_sum_1) ;
878 
879 
880             // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
881             v_pix4 = vec_xl(x, pix4) ;
882             v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4)) ;
883             v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2) ;
884             v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask) ;
885             v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2) ;
886             v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask) ;
887             v_sum_2 = vec_add(v_short_2_0, v_short_2_1) ;
888             v_results_2 = vec_add(v_results_2, v_sum_2) ;
889 
890 
891             // for(int ii=0; ii<16; ii++) { res[3] += abs(pix1[x + ii] - pix5[x + ii]); }
892             v_pix5 = vec_xl(x, pix5) ;
893             v_abs_diff_3 = vec_sub(vec_max(v_pix1, v_pix5), vec_min(v_pix1, v_pix5)) ;
894             v_short_3_0 = vec_unpackh((vector signed char)v_abs_diff_3) ;
895             v_short_3_0 = vec_and(v_short_3_0, v_unpack_mask) ;
896             v_short_3_1 = vec_unpackl((vector signed char)v_abs_diff_3) ;
897             v_short_3_1 = vec_and(v_short_3_1, v_unpack_mask) ;
898             v_sum_3 = vec_add(v_short_3_0, v_short_3_1) ;
899             v_results_3 = vec_add(v_results_3, v_sum_3) ;
900         }
901 
902         pix1 += FENC_STRIDE;
903         pix2 += frefstride;
904         pix3 += frefstride;
905         pix4 += frefstride;
906         pix5 += frefstride;
907     }
908 
909 
910     v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros) ;
911     v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros) ;
912     res[0] = v_results_int_0[3] ;
913 
914 
915     v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros) ;
916     v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros) ;
917     res[1] = v_results_int_1[3] ;
918 
919 
920     v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros) ;
921     v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros) ;
922     res[2] = v_results_int_2[3] ;
923 
924 
925     v_results_int_3 = vec_sum4s((vector signed short)v_results_3, (vector signed int)v_zeros) ;
926     v_results_int_3 = vec_sums(v_results_int_3, (vector signed int)v_zeros) ;
927     res[3] = v_results_int_3[3] ;
928     //printf("<32 16>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
929 } // end sad_x4_altivec
930 
931 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)932 void inline sad_x4_altivec<32, 24>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
933 {
934     int32_t sum[4];
935     sad16_x4_altivec<16, 24>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
936     sad16_x4_altivec<16, 24>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
937     res[0] += sum[0];
938     res[1] += sum[1];
939     res[2] += sum[2];
940     res[3] += sum[3];
941     //printf("<32 24>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
942 }
943 
944 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)945 void sad_x4_altivec<32,32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
946 {
947 
948     const int lx = 32 ;
949     const int ly = 32 ;
950 
951     vector unsigned int v_zeros = {0, 0, 0, 0} ;
952 
953     vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0} ;
954     vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0} ;
955     vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0} ;
956     vector signed short v_results_3 = {0, 0, 0, 0, 0, 0, 0, 0} ;
957 
958 
959     vector signed int v_results_int_0 ;
960     vector signed int v_results_int_1 ;
961     vector signed int v_results_int_2 ;
962     vector signed int v_results_int_3 ;
963 
964     vector unsigned char v_pix1 ;
965     vector unsigned char v_pix2 ;
966     vector unsigned char v_pix3 ;
967     vector unsigned char v_pix4 ;
968     vector unsigned char v_pix5 ;
969 
970     vector unsigned char v_abs_diff_0 ;
971     vector unsigned char v_abs_diff_1 ;
972     vector unsigned char v_abs_diff_2 ;
973     vector unsigned char v_abs_diff_3 ;
974 
975     vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
976 
977     vector signed short v_short_0_0 , v_short_0_1 ;
978     vector signed short v_short_1_0 , v_short_1_1 ;
979     vector signed short v_short_2_0 , v_short_2_1 ;
980     vector signed short v_short_3_0 , v_short_3_1 ;
981 
982     vector signed short v_sum_0 ;
983     vector signed short v_sum_1 ;
984     vector signed short v_sum_2 ;
985     vector signed short v_sum_3 ;
986 
987 
988     res[0] = 0;
989     res[1] = 0;
990     res[2] = 0;
991     res[3] = 0;
992     for (int y = 0; y < ly; y++)
993     {
994         for (int x = 0; x < lx; x+=16)
995         {
996             v_pix1 = vec_xl(x, pix1) ;
997 
998             // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
999             v_pix2 = vec_xl(x, pix2) ;
1000             v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2)) ;
1001             v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0) ;
1002             v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask) ;
1003             v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0) ;
1004             v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask) ;
1005             v_sum_0 = vec_add(v_short_0_0, v_short_0_1) ;
1006             v_results_0 = vec_add(v_results_0, v_sum_0) ;
1007 
1008             // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
1009             v_pix3 = vec_xl(x, pix3) ;
1010             v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3)) ;
1011             v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1) ;
1012             v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask) ;
1013             v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1) ;
1014             v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask) ;
1015             v_sum_1 = vec_add(v_short_1_0, v_short_1_1) ;
1016             v_results_1 = vec_add(v_results_1, v_sum_1) ;
1017 
1018 
1019             // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
1020             v_pix4 = vec_xl(x, pix4) ;
1021             v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4)) ;
1022             v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2) ;
1023             v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask) ;
1024             v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2) ;
1025             v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask) ;
1026             v_sum_2 = vec_add(v_short_2_0, v_short_2_1) ;
1027             v_results_2 = vec_add(v_results_2, v_sum_2) ;
1028 
1029 
1030             // for(int ii=0; ii<16; ii++) { res[3] += abs(pix1[x + ii] - pix5[x + ii]); }
1031             v_pix5 = vec_xl(x, pix5) ;
1032             v_abs_diff_3 = vec_sub(vec_max(v_pix1, v_pix5), vec_min(v_pix1, v_pix5)) ;
1033             v_short_3_0 = vec_unpackh((vector signed char)v_abs_diff_3) ;
1034             v_short_3_0 = vec_and(v_short_3_0, v_unpack_mask) ;
1035             v_short_3_1 = vec_unpackl((vector signed char)v_abs_diff_3) ;
1036             v_short_3_1 = vec_and(v_short_3_1, v_unpack_mask) ;
1037             v_sum_3 = vec_add(v_short_3_0, v_short_3_1) ;
1038             v_results_3 = vec_add(v_results_3, v_sum_3) ;
1039         }
1040 
1041         pix1 += FENC_STRIDE;
1042         pix2 += frefstride;
1043         pix3 += frefstride;
1044         pix4 += frefstride;
1045         pix5 += frefstride;
1046     }
1047 
1048 
1049     v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros) ;
1050     v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros) ;
1051     res[0] = v_results_int_0[3] ;
1052 
1053 
1054     v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros) ;
1055     v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros) ;
1056     res[1] = v_results_int_1[3] ;
1057 
1058 
1059     v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros) ;
1060     v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros) ;
1061     res[2] = v_results_int_2[3] ;
1062 
1063 
1064     v_results_int_3 = vec_sum4s((vector signed short)v_results_3, (vector signed int)v_zeros) ;
1065     v_results_int_3 = vec_sums(v_results_int_3, (vector signed int)v_zeros) ;
1066     res[3] = v_results_int_3[3] ;
1067 
1068     //printf("<32 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1069 } // end sad_x4_altivec
1070 
1071 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1072 void inline sad_x4_altivec<32, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1073 {
1074     int32_t sum[4];
1075     sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1076     sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
1077     res[0] += sum[0];
1078     res[1] += sum[1];
1079     res[2] += sum[2];
1080     res[3] += sum[3];
1081     //printf("<32 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1082 }
1083 
1084 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1085 void inline sad_x4_altivec<48, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1086 {
1087     int32_t sum[8];
1088     sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1089     sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
1090     sad16_x4_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, res);
1091     res[0] = sum[0]+sum[4]+res[0];
1092     res[1] = sum[1]+sum[5]+res[1];
1093     res[2] = sum[2]+sum[6]+res[2];
1094     res[3] = sum[3]+sum[7]+res[3];
1095     //printf("<48 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1096 }
1097 
1098 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1099 void inline sad_x4_altivec<64, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1100 {
1101     int32_t sum[12];
1102     sad16_x4_altivec<16, 16>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1103     sad16_x4_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
1104     sad16_x4_altivec<16, 16>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
1105     sad16_x4_altivec<16, 16>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
1106     res[0] = sum[0]+sum[4]+sum[8]+res[0];
1107     res[1] = sum[1]+sum[5]+sum[9]+res[1];
1108     res[2] = sum[2]+sum[6]+sum[10]+res[2];
1109     res[3] = sum[3]+sum[7]+sum[11]+res[3];
1110     //printf("<64 16>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1111 }
1112 
1113 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1114 void inline sad_x4_altivec<64, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1115 {
1116     int32_t sum[12];
1117     sad16_x4_altivec<16, 32>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1118     sad16_x4_altivec<16, 32>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
1119     sad16_x4_altivec<16, 32>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
1120     sad16_x4_altivec<16, 32>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
1121     res[0] = sum[0]+sum[4]+sum[8]+res[0];
1122     res[1] = sum[1]+sum[5]+sum[9]+res[1];
1123     res[2] = sum[2]+sum[6]+sum[10]+res[2];
1124     res[3] = sum[3]+sum[7]+sum[11]+res[3];
1125     //printf("<64 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1126 }
1127 
1128 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1129 void inline sad_x4_altivec<64, 48>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1130 {
1131     int32_t sum[12];
1132     sad16_x4_altivec<16, 48>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1133     sad16_x4_altivec<16, 48>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
1134     sad16_x4_altivec<16, 48>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
1135     sad16_x4_altivec<16, 48>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
1136     res[0] = sum[0]+sum[4]+sum[8]+res[0];
1137     res[1] = sum[1]+sum[5]+sum[9]+res[1];
1138     res[2] = sum[2]+sum[6]+sum[10]+res[2];
1139     res[3] = sum[3]+sum[7]+sum[11]+res[3];
1140     //printf("<64 48>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1141 }
1142 
1143 template<>
sad_x4_altivec(const pixel * pix1,const pixel * pix2,const pixel * pix3,const pixel * pix4,const pixel * pix5,intptr_t frefstride,int32_t * res)1144 void inline sad_x4_altivec<64, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
1145 {
1146     int32_t sum[12];
1147     sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
1148     sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
1149     sad16_x4_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
1150     sad16_x4_altivec<16, 64>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
1151     res[0] = sum[0]+sum[4]+sum[8]+res[0];
1152     res[1] = sum[1]+sum[5]+sum[9]+res[1];
1153     res[2] = sum[2]+sum[6]+sum[10]+res[2];
1154     res[3] = sum[3]+sum[7]+sum[11]+res[3];
1155     //printf("<64 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
1156 }
1157 
1158 
1159 /***********************************************************************
1160  * SATD routines - altivec implementation
1161  **********************************************************************/
1162 #define HADAMARD4_VEC(s0, s1, s2, s3, d0, d1, d2, d3) \
1163 {\
1164     vec_s16_t t0, t1, t2, t3;\
1165     t0 = vec_add(s0, s1);\
1166     t1 = vec_sub(s0, s1);\
1167     t2 = vec_add(s2, s3);\
1168     t3 = vec_sub(s2, s3);\
1169     d0 = vec_add(t0, t2);\
1170     d2 = vec_sub(t0, t2);\
1171     d1 = vec_add(t1, t3);\
1172     d3 = vec_sub(t1, t3);\
1173 }
1174 
1175 #define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
1176     b0 = vec_mergeh( a0, a0 ); \
1177     b1 = vec_mergeh( a1, a0 ); \
1178     b2 = vec_mergeh( a2, a0 ); \
1179     b3 = vec_mergeh( a3, a0 ); \
1180     a0 = vec_mergeh( b0, b2 ); \
1181     a1 = vec_mergel( b0, b2 ); \
1182     a2 = vec_mergeh( b1, b3 ); \
1183     a3 = vec_mergel( b1, b3 ); \
1184     b0 = vec_mergeh( a0, a2 ); \
1185     b1 = vec_mergel( a0, a2 ); \
1186     b2 = vec_mergeh( a1, a3 ); \
1187     b3 = vec_mergel( a1, a3 )
1188 
1189 #define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
1190     b0 = vec_mergeh( a0, a4 ); \
1191     b1 = vec_mergel( a0, a4 ); \
1192     b2 = vec_mergeh( a1, a5 ); \
1193     b3 = vec_mergel( a1, a5 ); \
1194     b4 = vec_mergeh( a2, a6 ); \
1195     b5 = vec_mergel( a2, a6 ); \
1196     b6 = vec_mergeh( a3, a7 ); \
1197     b7 = vec_mergel( a3, a7 ); \
1198     a0 = vec_mergeh( b0, b4 ); \
1199     a1 = vec_mergel( b0, b4 ); \
1200     a2 = vec_mergeh( b1, b5 ); \
1201     a3 = vec_mergel( b1, b5 ); \
1202     a4 = vec_mergeh( b2, b6 ); \
1203     a5 = vec_mergel( b2, b6 ); \
1204     a6 = vec_mergeh( b3, b7 ); \
1205     a7 = vec_mergel( b3, b7 ); \
1206     b0 = vec_mergeh( a0, a4 ); \
1207     b1 = vec_mergel( a0, a4 ); \
1208     b2 = vec_mergeh( a1, a5 ); \
1209     b3 = vec_mergel( a1, a5 ); \
1210     b4 = vec_mergeh( a2, a6 ); \
1211     b5 = vec_mergel( a2, a6 ); \
1212     b6 = vec_mergeh( a3, a7 ); \
1213     b7 = vec_mergel( a3, a7 )
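/* VEC_TRANSPOSE_8 is a full 8x8 transpose of 16-bit lanes built from three rounds
 * of vec_mergeh/vec_mergel; the 8-wide and 16-wide SATD kernels use it so that the
 * second Hadamard pass can again run across vectors. */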
1214 
1215 int satd_4x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1216 {
1217     ALIGN_VAR_16( int, sum );
1218 
1219     LOAD_ZERO;
1220     vec_s16_t pix1v, pix2v;
1221     vec_s16_t diff0v, diff1v, diff2v, diff3v;
1222     vec_s16_t temp0v, temp1v, temp2v, temp3v;
1223     vec_s32_t satdv, satdv1, satdv2, satdv3;
1224 
1225     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1226     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1227     diff0v = vec_sub( pix1v, pix2v );
1228     pix1   += stride_pix1;
1229     pix2   += stride_pix2;
1230 
1231     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1232     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1233     diff1v = vec_sub( pix1v, pix2v );
1234     pix1   += stride_pix1;
1235     pix2   += stride_pix2;
1236 
1237     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1238     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1239     diff2v = vec_sub( pix1v, pix2v );
1240     pix1   += stride_pix1;
1241     pix2   += stride_pix2;
1242 
1243     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1244     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1245     diff3v = vec_sub( pix1v, pix2v );
1246     pix1   += stride_pix1;
1247     pix2   += stride_pix2;
1248 
1249     /* Hadamard H */
1250     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1251     VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
1252     /* Hadamard V */
1253     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1254 
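    /* |x| is computed as max(x, 0 - x). vec_sum4s folds pairs of adjacent 16-bit
     * lanes into 32-bit partial sums, and vec_sum2s + vec_extract(..., 1) keeps
     * only the sums coming from lanes 0..3, i.e. the 4x4 block proper. */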
1255 #if 1
1256     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1257     satdv = vec_sum4s( temp0v, zero_s32v);
1258 
1259     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1260     satdv1 = vec_sum4s( temp1v, zero_s32v );
1261 
1262     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1263     satdv2 = vec_sum4s( temp2v, zero_s32v );
1264 
1265     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1266     satdv3 = vec_sum4s( temp3v, zero_s32v );
1267 
1268     satdv += satdv1;
1269     satdv2 += satdv3;
1270     satdv += satdv2;
1271 
1272     satdv = vec_sum2s( satdv, zero_s32v );
1273     //satdv = vec_splat( satdv, 1 );
1274     //vec_ste( satdv, 0, &sum );
1275     sum = vec_extract(satdv, 1);
1276     //print(sum);
1277 #else
1278     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1279     satdv = vec_sum4s( temp0v, zero_s32v);
1280 
1281     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1282     satdv= vec_sum4s( temp1v, satdv );
1283 
1284     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1285     satdv= vec_sum4s( temp2v, satdv );
1286 
1287     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1288     satdv= vec_sum4s( temp3v, satdv );
1289 
1290     satdv = vec_sum2s( satdv, zero_s32v );
1291     //satdv = vec_splat( satdv, 1 );
1292     //vec_ste( satdv, 0, &sum );
1293     sum = vec_extract(satdv, 1);
1294     //print(sum);
1295 #endif
1296     return sum >> 1;
1297 }
1298 
1299 #define HADAMARD4_x2vec(v_out0, v_out1, v_in0, v_in1, v_perm_l0_0, v_perm_l0_1) \
1300 { \
1301     \
1302     vector unsigned int v_l0_input_0, v_l0_input_1 ;  \
1303     v_l0_input_0 = vec_perm((vector unsigned int)v_in0, (vector unsigned int)v_in1, v_perm_l0_0) ;    \
1304     v_l0_input_1 = vec_perm((vector unsigned int)v_in0, (vector unsigned int)v_in1, v_perm_l0_1) ;    \
1305     \
1306     vector unsigned int v_l0_add_result, v_l0_sub_result ;    \
1307     v_l0_add_result = vec_add(v_l0_input_0, v_l0_input_1) ; \
1308     v_l0_sub_result = vec_sub(v_l0_input_0, v_l0_input_1) ; \
1309     \
1310     vector unsigned char v_perm_l1_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17} ; \
1311     vector unsigned char v_perm_l1_1 = {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F} ; \
1312     \
1313     vector unsigned int v_l1_input_0, v_l1_input_1 ;  \
1314     v_l1_input_0 = vec_perm(v_l0_add_result, v_l0_sub_result, v_perm_l1_0) ;    \
1315     v_l1_input_1 = vec_perm(v_l0_add_result, v_l0_sub_result, v_perm_l1_1) ;    \
1316     \
1317     vector unsigned int v_l1_add_result, v_l1_sub_result ;    \
1318     v_l1_add_result = vec_add(v_l1_input_0, v_l1_input_1) ; \
1319     v_l1_sub_result = vec_sub(v_l1_input_0, v_l1_input_1) ; \
1320     \
1321     \
1322     v_out0 = v_l1_add_result ;    \
1323     v_out1 = v_l1_sub_result ;    \
1324 \
1325 \
1326 }
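/* HADAMARD4_x2vec mirrors the scalar HADAMARD4 on packed data: each 32-bit lane
 * carries two 16-bit values (the sum2_t packing used by the C reference), so one
 * invocation does the work of two scalar HADAMARD4 calls. The caller-supplied
 * first-level permutes select the operand pairs; the fixed second-level permutes
 * pair up the add and sub halves. */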
1327 
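/* satd_4x8 is two stacked 4x4 transforms: the partial sums of the first four rows
 * stay in vector registers and the second half is accumulated on top before a
 * single vec_sum2s / vec_extract reduction. */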
1328 int satd_4x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1329 {
1330     ALIGN_VAR_16( int, sum );
1331 
1332     LOAD_ZERO;
1333     vec_s16_t pix1v, pix2v;
1334     vec_s16_t diff0v, diff1v, diff2v, diff3v;
1335     vec_s16_t temp0v, temp1v, temp2v, temp3v;
1336     vec_s32_t satdv, satdv1, satdv2, satdv3;
1337 
1338     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1339     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1340     diff0v = vec_sub( pix1v, pix2v );
1341     pix1   += stride_pix1;
1342     pix2   += stride_pix2;
1343 
1344     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1345     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1346     diff1v = vec_sub( pix1v, pix2v );
1347     pix1   += stride_pix1;
1348     pix2   += stride_pix2;
1349 
1350     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1351     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1352     diff2v = vec_sub( pix1v, pix2v );
1353     pix1   += stride_pix1;
1354     pix2   += stride_pix2;
1355 
1356     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1357     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1358     diff3v = vec_sub( pix1v, pix2v );
1359     pix1   += stride_pix1;
1360     pix2   += stride_pix2;
1361 
1362     /* Hadamard H */
1363     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1364     VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
1365     /* Hadamard V */
1366     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1367 
1368 #if 1
1369     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1370     satdv = vec_sum4s( temp0v, zero_s32v);
1371 
1372     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1373     satdv1= vec_sum4s( temp1v, zero_s32v );
1374 
1375     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1376     satdv2= vec_sum4s( temp2v, zero_s32v );
1377 
1378     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1379     satdv3= vec_sum4s( temp3v, zero_s32v );
1380 
1381     satdv += satdv1;
1382     satdv2 += satdv3;
1383     satdv += satdv2;
1384 #else
1385     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1386     satdv = vec_sum4s( temp0v, zero_s32v);
1387 
1388     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1389     satdv= vec_sum4s( temp1v, satdv );
1390 
1391     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1392     satdv= vec_sum4s( temp2v, satdv );
1393 
1394     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1395     satdv= vec_sum4s( temp3v, satdv );
1396 #endif
1397 
1398     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1399     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1400     diff0v = vec_sub( pix1v, pix2v );
1401     pix1   += stride_pix1;
1402     pix2   += stride_pix2;
1403 
1404     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1405     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1406     diff1v = vec_sub( pix1v, pix2v );
1407     pix1   += stride_pix1;
1408     pix2   += stride_pix2;
1409 
1410     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1411     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1412     diff2v = vec_sub( pix1v, pix2v );
1413     pix1   += stride_pix1;
1414     pix2   += stride_pix2;
1415 
1416     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1417     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1418     diff3v = vec_sub( pix1v, pix2v );
1419     pix1   += stride_pix1;
1420     pix2   += stride_pix2;
1421 
1422     /* Hadamard H */
1423     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1424     VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
1425     /* Hadamard V */
1426     HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
1427 
1428 #if 1
1429     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1430     satdv += vec_sum4s( temp0v, zero_s32v);
1431 
1432     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1433     satdv1 = vec_sum4s( temp1v, zero_s32v );
1434 
1435     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1436     satdv2 = vec_sum4s( temp2v, zero_s32v );
1437 
1438     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1439     satdv3 = vec_sum4s( temp3v, zero_s32v );
1440 
1441     satdv += satdv1;
1442     satdv2 += satdv3;
1443     satdv += satdv2;
1444 
1445     satdv = vec_sum2s( satdv, zero_s32v );
1446     sum = vec_extract(satdv, 1);
1447 #else
1448     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1449     satdv = vec_sum4s( temp0v, satdv);
1450 
1451     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1452     satdv= vec_sum4s( temp1v, satdv );
1453 
1454     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1455     satdv= vec_sum4s( temp2v, satdv );
1456 
1457     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1458     satdv= vec_sum4s( temp3v, satdv );
1459 
1460     satdv = vec_sum2s( satdv, zero_s32v );
1461     satdv = vec_splat( satdv, 1 );
1462     vec_ste( satdv, 0, &sum );
1463 #endif
1464     return sum >> 1;
1465 }
1466 
1467 #if 1
1468 static int satd_8x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1469 {
1470     const vector signed short v_unsigned_short_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
1471     vector unsigned char v_pix1_ub, v_pix2_ub ;
1472     vector signed short v_pix1_ss, v_pix2_ss ;
1473     vector signed short v_sub ;
1474     vector signed int v_sub_sw_0, v_sub_sw_1 ;
1475     vector signed int v_packed_sub_0, v_packed_sub_1 ;
1476     vector unsigned int v_hadamard_result_0, v_hadamard_result_1, v_hadamard_result_2, v_hadamard_result_3 ;
1477 
1478    // for (int i = 0; i < 4; i+=2, pix1 += 2*stride_pix1, pix2 += 2*stride_pix2)
1479    // {
1480         //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
1481         //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
1482         //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
1483         //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
1484 
1485     // Load 16 elements from each pix array
1486     v_pix1_ub = vec_xl(0, pix1) ;
1487     v_pix2_ub = vec_xl(0, pix2) ;
1488 
1489     // We only care about the top 8, and in short format
1490     v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
1491     v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;
1492 
1493     // Undo the sign extend of the unpacks
1494     v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
1495     v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;
1496 
1497     // Perform the subtraction
1498     v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;
1499 
1500     // Unpack the sub results into ints
1501     v_sub_sw_0 = vec_unpackh(v_sub) ;
1502     v_sub_sw_1 = vec_unpackl(v_sub) ;
1503     v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16,16,16,16}) ;
1504 
1505     // Add the int sub results (compatibility with the original code)
1506     v_packed_sub_0 = vec_add(v_sub_sw_0, v_sub_sw_1) ;
1507 
1508     //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
1509     //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
1510     //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
1511     //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
1512 
1513     // Load 16 elements from each pix array
1514     v_pix1_ub = vec_xl(stride_pix1, pix1) ;
1515     v_pix2_ub = vec_xl(stride_pix2, pix2) ;
1516 
1517     // We only care about the top 8, and in short format
1518     v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
1519     v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;
1520 
1521     // Undo the sign extend of the unpacks
1522     v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
1523     v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;
1524 
1525     // Perform the subtraction
1526     v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;
1527 
1528     // Unpack the sub results into ints
1529     v_sub_sw_0 = vec_unpackh(v_sub) ;
1530     v_sub_sw_1 = vec_unpackl(v_sub) ;
1531     v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16,16,16,16}) ;
1532 
1533     // Add the int sub results (compatibility with the original code)
1534     v_packed_sub_1 = vec_add(v_sub_sw_0, v_sub_sw_1) ;
1535 
1536     // original: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
1537     // modified while vectorizing: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], v_packed_sub_0[0], v_packed_sub_0[1], v_packed_sub_0[2], v_packed_sub_0[3]);
1538 
1539     // original: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], a0, a1, a2, a3);
1540     // modified while vectorizing: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], v_packed_sub_1[0], v_packed_sub_1[1], v_packed_sub_1[2], v_packed_sub_1[3]);
1541 
1542     // Go after two hadamard4(int) at once, fully utilizing the vector width
1543     // Note that the hadamard4(int) provided by x264/x265 is actually two hadamard4(short) simultaneously
1544     const vector unsigned char v_perm_l0_0 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B} ;
1545     const vector unsigned char v_perm_l0_1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F} ;
1546     HADAMARD4_x2vec(v_hadamard_result_0, v_hadamard_result_1, v_packed_sub_0, v_packed_sub_1, v_perm_l0_0, v_perm_l0_1) ;
1547 
1548     //##
1549     // tmp[0][0] = v_hadamard_result_0[0] ;
1550     // tmp[0][1] = v_hadamard_result_0[2] ;
1551     // tmp[0][2] = v_hadamard_result_1[0] ;
1552     // tmp[0][3] = v_hadamard_result_1[2] ;
1553 
1554     // tmp[1][0] = v_hadamard_result_0[1] ;
1555     // tmp[1][1] = v_hadamard_result_0[3] ;
1556     // tmp[1][2] = v_hadamard_result_1[1] ;
1557     // tmp[1][3] = v_hadamard_result_1[3] ;
1558     //##
1559 
1560     //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
1561     //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
1562     //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
1563     //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
1564 
1565     // Load 16 elements from each pix array
1566     v_pix1_ub = vec_xl(2*stride_pix1, pix1) ;
1567     v_pix2_ub = vec_xl(2*stride_pix2, pix2) ;
1568 
1569     // We only care about the top 8, and in short format
1570     v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
1571     v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;
1572 
1573     // Undo the sign extend of the unpacks
1574     v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
1575     v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;
1576 
1577     // Perform the subtraction
1578     v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;
1579 
1580     // Unpack the sub results into ints
1581     v_sub_sw_0 = vec_unpackh(v_sub) ;
1582     v_sub_sw_1 = vec_unpackl(v_sub) ;
1583     v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16,16,16,16}) ;
1584 
1585     // Add the int sub results (compatibility with the original code)
1586     v_packed_sub_0 = vec_add(v_sub_sw_0, v_sub_sw_1) ;
1587 
1588     //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
1589     //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
1590     //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
1591     //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);
1592 
1593     // Load 16 elements from each pix array
1594     v_pix1_ub = vec_xl(3*stride_pix1, pix1) ;
1595     v_pix2_ub = vec_xl(3*stride_pix2, pix2) ;
1596 
1597     // We only care about the top 8, and in short format
1598     v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
1599     v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;
1600 
1601     // Undo the sign extend of the unpacks
1602     v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
1603     v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;
1604 
1605     // Perform the subtraction
1606     v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;
1607 
1608     // Unpack the sub results into ints
1609     v_sub_sw_0 = vec_unpackh(v_sub) ;
1610     v_sub_sw_1 = vec_unpackl(v_sub) ;
1611     v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16,16,16,16}) ;
1612 
1613     // Add the int sub results (compatibility with the original code)
1614     v_packed_sub_1 = vec_add(v_sub_sw_0, v_sub_sw_1) ;
1615 
1616 
1617     // original: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
1618     // modified while vectorizing: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], v_packed_sub_0[0], v_packed_sub_0[1], v_packed_sub_0[2], v_packed_sub_0[3]);
1619 
1620     // original: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], a0, a1, a2, a3);
1621     // modified while vectorizing: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], v_packed_sub_1[0], v_packed_sub_1[1], v_packed_sub_1[2], v_packed_sub_1[3]);
1622 
1623     // Go after two hadamard4(int) at once, fully utilizing the vector width
1624     // Note that the hadamard4(int) provided by x264/x265 is actually two hadamard4(short) simultaneously
1625     HADAMARD4_x2vec(v_hadamard_result_2, v_hadamard_result_3, v_packed_sub_0, v_packed_sub_1, v_perm_l0_0, v_perm_l0_1) ;
1626 
1627     //##
1628     //## tmp[2][0] = v_hadamard_result_2[0] ;
1629     //## tmp[2][1] = v_hadamard_result_2[2] ;
1630     //## tmp[2][2] = v_hadamard_result_3[0] ;
1631     //## tmp[2][3] = v_hadamard_result_3[2] ;
1632     //##
1633     //##        tmp[3][0] = v_hadamard_result_2[1] ;
1634     //##        tmp[3][1] = v_hadamard_result_2[3] ;
1635     //##        tmp[3][2] = v_hadamard_result_3[1] ;
1636     //##        tmp[3][3] = v_hadamard_result_3[3] ;
1637     //##
1638    // }
1639    //    for (int i = 0; i < 4; i++)
1640    //    {
1641        // HADAMARD4(a0, a1, a2, a3, tmp[0][0], tmp[1][0], tmp[2][0], tmp[3][0]);
1642        // sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
1643 
1644        // HADAMARD4(a0, a1, a2, a3, tmp[0][1], tmp[1][1], tmp[2][1], tmp[3][1]);
1645        // sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
1646     const vector unsigned char v_lowerloop_perm_l0_0 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B} ;
1647     const vector unsigned char v_lowerloop_perm_l0_1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F} ;
1648     HADAMARD4_x2vec(v_hadamard_result_0, v_hadamard_result_2, v_hadamard_result_0, v_hadamard_result_2, v_lowerloop_perm_l0_0, v_lowerloop_perm_l0_1) ;
1649 
1650     const vector unsigned int v_15      = {15, 15, 15, 15} ;
1651     const vector unsigned int v_0x10001 = (vector unsigned int){ 0x10001, 0x10001, 0x10001, 0x10001 };
1652     const vector unsigned int v_0xffff  = (vector unsigned int){ 0xffff, 0xffff, 0xffff, 0xffff };
1653 
1654 
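    // These sign/shift/multiply/xor blocks emulate the scalar abs2() on the packed
    // sum2_t layout: an arithmetic shift right by 15 exposes the sign bit of each
    // 16-bit half, the 0x10001 mask keeps exactly one sign bit per half, the
    // vmuluwm by 0xffff expands each set bit into a per-half 0xffff mask, and the
    // add + xor pair then applies the usual (x + m) ^ m negate-if-negative trick.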
1655     vector unsigned int v_hadamard_result_s_0 ;
1656     v_hadamard_result_s_0 = vec_sra(v_hadamard_result_0, v_15) ;
1657     v_hadamard_result_s_0 = vec_and(v_hadamard_result_s_0, v_0x10001) ;
1658     asm ("vmuluwm %0,%1,%2"
1659               : "=v" (v_hadamard_result_s_0)
1660               : "v"  (v_hadamard_result_s_0) , "v" (v_0xffff)
1661             ) ;
1662     v_hadamard_result_0 = vec_add(v_hadamard_result_0, v_hadamard_result_s_0) ;
1663     v_hadamard_result_0 = vec_xor(v_hadamard_result_0, v_hadamard_result_s_0) ;
1664 
1665     vector unsigned int v_hadamard_result_s_2 ;
1666     v_hadamard_result_s_2 = vec_sra(v_hadamard_result_2, v_15) ;
1667     v_hadamard_result_s_2 = vec_and(v_hadamard_result_s_2, v_0x10001) ;
1668     asm ("vmuluwm %0,%1,%2"
1669               : "=v" (v_hadamard_result_s_2)
1670               : "v"  (v_hadamard_result_s_2) , "v" (v_0xffff)
1671             ) ;
1672     v_hadamard_result_2 = vec_add(v_hadamard_result_2, v_hadamard_result_s_2) ;
1673     v_hadamard_result_2 = vec_xor(v_hadamard_result_2, v_hadamard_result_s_2) ;
1674 
1675     // HADAMARD4(a0, a1, a2, a3, tmp[0][2], tmp[1][2], tmp[2][2], tmp[3][2]);
1676     // sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
1677 
1678     // HADAMARD4(a0, a1, a2, a3, tmp[0][3], tmp[1][3], tmp[2][3], tmp[3][3]);
1679     // sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
1680 
1681     HADAMARD4_x2vec(v_hadamard_result_1, v_hadamard_result_3, v_hadamard_result_1, v_hadamard_result_3, v_lowerloop_perm_l0_0, v_lowerloop_perm_l0_1) ;
1682 
1683     vector unsigned int v_hadamard_result_s_1 ;
1684     v_hadamard_result_s_1 = vec_sra(v_hadamard_result_1, v_15) ;
1685     v_hadamard_result_s_1 = vec_and(v_hadamard_result_s_1, v_0x10001) ;
1686     asm ("vmuluwm %0,%1,%2"
1687               : "=v" (v_hadamard_result_s_1)
1688               : "v"  (v_hadamard_result_s_1) , "v" (v_0xffff)
1689             ) ;
1690     v_hadamard_result_1 = vec_add(v_hadamard_result_1, v_hadamard_result_s_1) ;
1691     v_hadamard_result_1 = vec_xor(v_hadamard_result_1, v_hadamard_result_s_1) ;
1692 
1693     vector unsigned int v_hadamard_result_s_3 ;
1694     v_hadamard_result_s_3 = vec_sra(v_hadamard_result_3, v_15) ;
1695     v_hadamard_result_s_3 = vec_and(v_hadamard_result_s_3, v_0x10001) ;
1696     asm ("vmuluwm %0,%1,%2"
1697               : "=v" (v_hadamard_result_s_3)
1698               : "v"  (v_hadamard_result_s_3) , "v" (v_0xffff)
1699             ) ;
1700     v_hadamard_result_3 = vec_add(v_hadamard_result_3, v_hadamard_result_s_3) ;
1701     v_hadamard_result_3 = vec_xor(v_hadamard_result_3, v_hadamard_result_s_3) ;
1702 
1703 //   }
1704 
1705 
1706     vector unsigned int v_sum_0, v_sum_1 ;
1707     vector signed int v_sum ;
1708 
1709     v_sum_0 = vec_add(v_hadamard_result_0, v_hadamard_result_2) ;
1710     v_sum_1 = vec_add(v_hadamard_result_1, v_hadamard_result_3) ;
1711 
1712     v_sum_0 = vec_add(v_sum_0, v_sum_1) ;
1713 
1714     vector signed int v_zero = {0, 0, 0, 0} ;
1715     v_sum = vec_sums((vector signed int)v_sum_0, v_zero) ;
1716 
1717     // return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
1718     return (((sum_t)v_sum[3]) + (v_sum[3] >> BITS_PER_SUM)) >> 1;
1719 }
1720 #else
1721 int satd_8x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1722 {
1723     ALIGN_VAR_16( int, sum );
1724     LOAD_ZERO;
1725     vec_s16_t pix1v, pix2v;
1726     vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
1727     vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
1728     vec_s32_t satdv;
1729 
1730 
1731     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1732     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1733     diff0v = vec_sub( pix1v, pix2v );
1734     pix1   += stride_pix1;
1735     pix2   += stride_pix2;
1736 
1737     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1738     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1739     diff1v = vec_sub( pix1v, pix2v );
1740     pix1   += stride_pix1;
1741     pix2   += stride_pix2;
1742 
1743     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1744     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1745     diff2v = vec_sub( pix1v, pix2v );
1746     pix1   += stride_pix1;
1747     pix2   += stride_pix2;
1748 
1749     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1750     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1751     diff3v = vec_sub( pix1v, pix2v );
1752     pix1   += stride_pix1;
1753     pix2   += stride_pix2;
1754 
1755     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1756     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1757     diff4v = vec_sub( pix1v, pix2v );
1758     pix1   += stride_pix1;
1759     pix2   += stride_pix2;
1760 
1761     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1762     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1763     diff5v = vec_sub( pix1v, pix2v );
1764     pix1   += stride_pix1;
1765     pix2   += stride_pix2;
1766 
1767     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1768     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1769     diff6v = vec_sub( pix1v, pix2v );
1770     pix1   += stride_pix1;
1771     pix2   += stride_pix2;
1772 
1773     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1774     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1775     diff7v = vec_sub( pix1v, pix2v );
1776     pix1   += stride_pix1;
1777     pix2   += stride_pix2;
1778 
1779     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
1780     //HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
1781 
1782     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
1783                      temp4v, temp5v, temp6v, temp7v,
1784                      diff0v, diff1v, diff2v, diff3v,
1785                      diff4v, diff5v, diff6v, diff7v );
1786 
1787     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
1788     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
1789 
1790     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1791     satdv = vec_sum4s( temp0v, satdv);
1792 
1793     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1794     satdv= vec_sum4s( temp1v, satdv );
1795 
1796     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1797     satdv= vec_sum4s( temp2v, satdv );
1798 
1799     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1800     satdv= vec_sum4s( temp3v, satdv );
1801 
1802     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
1803     satdv = vec_sum4s( temp4v, satdv);
1804 
1805     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
1806     satdv= vec_sum4s( temp5v, satdv );
1807 
1808     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
1809     satdv= vec_sum4s( temp6v, satdv );
1810 
1811     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
1812     satdv= vec_sum4s( temp7v, satdv );
1813 
1814     satdv = vec_sums( satdv, zero_s32v );
1815     satdv = vec_splat( satdv, 3 );
1816     vec_ste( satdv, 0, &sum );
1817 
1818     //print(sum);
1819     return sum>>1;
1820 }
1821 #endif
1822 
1823 int satd_8x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1824 {
1825     ALIGN_VAR_16( int, sum );
1826     LOAD_ZERO;
1827     vec_s16_t pix1v, pix2v;
1828     vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
1829     vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
1830     vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
1831     //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
1832 
1833 
1834     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1835     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1836     diff0v = vec_sub( pix1v, pix2v );
1837     pix1   += stride_pix1;
1838     pix2   += stride_pix2;
1839 
1840     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1841     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1842     diff1v = vec_sub( pix1v, pix2v );
1843     pix1   += stride_pix1;
1844     pix2   += stride_pix2;
1845 
1846     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1847     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1848     diff2v = vec_sub( pix1v, pix2v );
1849     pix1   += stride_pix1;
1850     pix2   += stride_pix2;
1851 
1852     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1853     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1854     diff3v = vec_sub( pix1v, pix2v );
1855     pix1   += stride_pix1;
1856     pix2   += stride_pix2;
1857 
1858     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1859     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1860     diff4v = vec_sub( pix1v, pix2v );
1861     pix1   += stride_pix1;
1862     pix2   += stride_pix2;
1863 
1864     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1865     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1866     diff5v = vec_sub( pix1v, pix2v );
1867     pix1   += stride_pix1;
1868     pix2   += stride_pix2;
1869 
1870     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1871     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1872     diff6v = vec_sub( pix1v, pix2v );
1873     pix1   += stride_pix1;
1874     pix2   += stride_pix2;
1875 
1876     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1877     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
1878     diff7v = vec_sub( pix1v, pix2v );
1879     pix1   += stride_pix1;
1880     pix2   += stride_pix2;
1881 
1882     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
1883     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
1884 
1885     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
1886                      temp4v, temp5v, temp6v, temp7v,
1887                      diff0v, diff1v, diff2v, diff3v,
1888                      diff4v, diff5v, diff6v, diff7v );
1889 
1890     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
1891     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
1892 
1893 #if 1
1894     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1895     satdv = vec_sum4s( temp0v, zero_s32v);
1896 
1897     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1898     satdv1= vec_sum4s( temp1v, zero_s32v );
1899 
1900     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1901     satdv2= vec_sum4s( temp2v, zero_s32v );
1902 
1903     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1904     satdv3= vec_sum4s( temp3v, zero_s32v );
1905 
1906     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
1907     satdv4 = vec_sum4s( temp4v, zero_s32v);
1908 
1909     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
1910     satdv5= vec_sum4s( temp5v, zero_s32v );
1911 
1912     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
1913     satdv6= vec_sum4s( temp6v, zero_s32v );
1914 
1915     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
1916     satdv7= vec_sum4s( temp7v, zero_s32v );
1917 
1918     satdv  += satdv1;
1919     satdv2 += satdv3;
1920     satdv4 += satdv5;
1921     satdv6 += satdv7;
1922 
1923     satdv  += satdv2;
1924     satdv4 += satdv6;
1925     satdv  += satdv4;
1926 
1927     satdv = vec_sums( satdv, zero_s32v );
1928     sum = vec_extract(satdv, 3);
1929 #else
1930     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
1931     satdv = vec_sum4s( temp0v, zero_s32v);
1932 
1933     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
1934     satdv= vec_sum4s( temp1v, satdv );
1935 
1936     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
1937     satdv= vec_sum4s( temp2v, satdv );
1938 
1939     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
1940     satdv= vec_sum4s( temp3v, satdv );
1941 
1942     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
1943     satdv = vec_sum4s( temp4v, satdv);
1944 
1945     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
1946     satdv= vec_sum4s( temp5v, satdv );
1947 
1948     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
1949     satdv= vec_sum4s( temp6v, satdv );
1950 
1951     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
1952     satdv= vec_sum4s( temp7v, satdv );
1953 
1954     satdv = vec_sums( satdv, zero_s32v );
1955     satdv = vec_splat( satdv, 3 );
1956     vec_ste( satdv, 0, &sum );
1957 #endif
1958     return sum>>1;
1959 }
1960 
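/* satd_8x16 accumulates two 8x8 passes: the first half's partial sums stay in
 * satdv and the second half is added on top, so only one vec_sums / vec_extract
 * is needed at the end. */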
1961 int satd_8x16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
1962 {
1963     ALIGN_VAR_16( int, sum );
1964 
1965     LOAD_ZERO;
1966     vec_s16_t pix1v, pix2v;
1967     vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
1968     vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
1969     //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
1970     vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
1971 
1972 
1973     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1974     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1975     diff0v = vec_sub( pix1v, pix2v );
1976     pix1   += stride_pix1;
1977     pix2   += stride_pix2;
1978 
1979     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1980     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1981     diff1v = vec_sub( pix1v, pix2v );
1982     pix1   += stride_pix1;
1983     pix2   += stride_pix2;
1984 
1985     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1986     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1987     diff2v = vec_sub( pix1v, pix2v );
1988     pix1   += stride_pix1;
1989     pix2   += stride_pix2;
1990 
1991     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1992     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1993     diff3v = vec_sub( pix1v, pix2v );
1994     pix1   += stride_pix1;
1995     pix2   += stride_pix2;
1996 
1997     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
1998     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
1999     diff4v = vec_sub( pix1v, pix2v );
2000     pix1   += stride_pix1;
2001     pix2   += stride_pix2;
2002 
2003     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2004     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2005     diff5v = vec_sub( pix1v, pix2v );
2006     pix1   += stride_pix1;
2007     pix2   += stride_pix2;
2008 
2009     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2010     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2011     diff6v = vec_sub( pix1v, pix2v );
2012     pix1   += stride_pix1;
2013     pix2   += stride_pix2;
2014 
2015     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2016     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2017     diff7v = vec_sub( pix1v, pix2v );
2018     pix1   += stride_pix1;
2019     pix2   += stride_pix2;
2020 
2021     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
2022     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
2023 
2024     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2025                      temp4v, temp5v, temp6v, temp7v,
2026                      diff0v, diff1v, diff2v, diff3v,
2027                      diff4v, diff5v, diff6v, diff7v );
2028 
2029     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
2030     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
2031 
2032 #if 1
2033     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2034     satdv = vec_sum4s( temp0v, zero_s32v);
2035 
2036     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2037     satdv1= vec_sum4s( temp1v, zero_s32v );
2038 
2039     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2040     satdv2= vec_sum4s( temp2v, zero_s32v );
2041 
2042     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2043     satdv3= vec_sum4s( temp3v, zero_s32v );
2044 
2045     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2046     satdv4 = vec_sum4s( temp4v, zero_s32v);
2047 
2048     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2049     satdv5= vec_sum4s( temp5v, zero_s32v );
2050 
2051     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2052     satdv6= vec_sum4s( temp6v, zero_s32v );
2053 
2054     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2055     satdv7= vec_sum4s( temp7v, zero_s32v );
2056 
2057     satdv  += satdv1;
2058     satdv2 += satdv3;
2059     satdv4 += satdv5;
2060     satdv6 += satdv7;
2061 
2062     satdv  += satdv2;
2063     satdv4 += satdv6;
2064     satdv  += satdv4;
2065 #else
2066     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2067     satdv = vec_sum4s( temp0v, satdv);
2068 
2069     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2070     satdv= vec_sum4s( temp1v, satdv );
2071 
2072     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2073     satdv= vec_sum4s( temp2v, satdv );
2074 
2075     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2076     satdv= vec_sum4s( temp3v, satdv );
2077 
2078     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2079     satdv = vec_sum4s( temp4v, satdv);
2080 
2081     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2082     satdv= vec_sum4s( temp5v, satdv );
2083 
2084     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2085     satdv= vec_sum4s( temp6v, satdv );
2086 
2087     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2088     satdv= vec_sum4s( temp7v, satdv );
2089 #endif
2090 
2091     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2092     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2093     diff0v = vec_sub( pix1v, pix2v );
2094     pix1   += stride_pix1;
2095     pix2   += stride_pix2;
2096 
2097     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2098     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2099     diff1v = vec_sub( pix1v, pix2v );
2100     pix1   += stride_pix1;
2101     pix2   += stride_pix2;
2102 
2103     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2104     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2105     diff2v = vec_sub( pix1v, pix2v );
2106     pix1   += stride_pix1;
2107     pix2   += stride_pix2;
2108 
2109     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2110     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2111     diff3v = vec_sub( pix1v, pix2v );
2112     pix1   += stride_pix1;
2113     pix2   += stride_pix2;
2114 
2115     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2116     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
2117     diff4v = vec_sub( pix1v, pix2v );
2118     pix1   += stride_pix1;
2119     pix2   += stride_pix2;
2120 
2121     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2122     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2123     diff5v = vec_sub( pix1v, pix2v );
2124     pix1   += stride_pix1;
2125     pix2   += stride_pix2;
2126 
2127     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2128     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2129     diff6v = vec_sub( pix1v, pix2v );
2130     pix1   += stride_pix1;
2131     pix2   += stride_pix2;
2132 
2133     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
2134     pix2v = vec_u8_to_s16(vec_xl(0, pix2) );
2135     diff7v = vec_sub( pix1v, pix2v );
2136     pix1   += stride_pix1;
2137     pix2   += stride_pix2;
2138 
2139     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
2140     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
2141 
2142     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2143                      temp4v, temp5v, temp6v, temp7v,
2144                      diff0v, diff1v, diff2v, diff3v,
2145                      diff4v, diff5v, diff6v, diff7v );
2146 
2147     HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
2148     HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );
2149 
2150 #if 1
2151     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2152     satdv += vec_sum4s( temp0v, zero_s32v);
2153 
2154     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2155     satdv1= vec_sum4s( temp1v, zero_s32v );
2156 
2157     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2158     satdv2= vec_sum4s( temp2v, zero_s32v );
2159 
2160     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2161     satdv3= vec_sum4s( temp3v, zero_s32v );
2162 
2163     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2164     satdv4 = vec_sum4s( temp4v, zero_s32v);
2165 
2166     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2167     satdv5= vec_sum4s( temp5v, zero_s32v );
2168 
2169     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2170     satdv6= vec_sum4s( temp6v, zero_s32v );
2171 
2172     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2173     satdv7= vec_sum4s( temp7v, zero_s32v );
2174 
2175     satdv  += satdv1;
2176     satdv2 += satdv3;
2177     satdv4 += satdv5;
2178     satdv6 += satdv7;
2179 
2180     satdv  += satdv2;
2181     satdv4 += satdv6;
2182     satdv  += satdv4;
2183 
2184     satdv = vec_sums( satdv, zero_s32v );
2185     sum = vec_extract(satdv, 3);
2186 #else
2187     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2188     satdv = vec_sum4s( temp0v, satdv);
2189 
2190     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2191     satdv= vec_sum4s( temp1v, satdv );
2192 
2193     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2194     satdv= vec_sum4s( temp2v, satdv );
2195 
2196     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2197     satdv= vec_sum4s( temp3v, satdv );
2198 
2199     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2200     satdv = vec_sum4s( temp4v, satdv);
2201 
2202     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2203     satdv= vec_sum4s( temp5v, satdv );
2204 
2205     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2206     satdv= vec_sum4s( temp6v, satdv );
2207 
2208     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2209     satdv= vec_sum4s( temp7v, satdv );
2210 
2211     satdv = vec_sums( satdv, zero_s32v );
2212     satdv = vec_splat( satdv, 3 );
2213     vec_ste( satdv, 0, &sum );
2214 #endif
2215     return sum >> 1;
2216 }
2217 
2218 #define VEC_DIFF_S16(p1,i1,p2,i2,dh,dl)\
2219 {\
2220     pix1v = (vec_s16_t)vec_xl(0, p1);\
2221     temp0v = vec_u8_to_s16_h( pix1v );\
2222     temp1v = vec_u8_to_s16_l( pix1v );\
2223     pix2v = (vec_s16_t)vec_xl(0, p2);\
2224     temp2v = vec_u8_to_s16_h( pix2v );\
2225     temp3v = vec_u8_to_s16_l( pix2v );\
2226     dh = vec_sub( temp0v, temp2v );\
2227     dl  = vec_sub( temp1v, temp3v );\
2228     p1 += i1;\
2229     p2 += i2;\
2230 }
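/* VEC_DIFF_S16 loads one 16-pixel row from each source, widens it to two 8-lane
 * int16 vectors (first eight pixels in dh, last eight in dl), stores their
 * differences and advances both row pointers by their strides. */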
2231 
2232 
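/* For the 16-wide kernels VEC_DIFF_S16 splits every row into a left half (diffh*)
 * and a right half (diffl*) of eight pixels each, so the existing 8-lane
 * butterflies and the 8x8 transpose can be reused unchanged. */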
2233 int satd_16x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2234 {
2235     ALIGN_VAR_16( int, sum );
2236     LOAD_ZERO;
2237     //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
2238     vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
2239     vec_s16_t pix1v, pix2v;
2240     vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v;
2241     vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v;
2242     vec_s16_t temp0v, temp1v, temp2v, temp3v,
2243               temp4v, temp5v, temp6v, temp7v;
2244 
2245     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2246     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2247     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2248     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2249 
2250 
2251     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2252     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp4v, temp5v, temp6v, temp7v );
2253 
2254     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2255                      temp4v, temp5v, temp6v, temp7v,
2256                      diffh0v, diffh1v, diffh2v, diffh3v,
2257                      diffl0v, diffl1v, diffl2v, diffl3v);
2258 
2259     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2260     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp4v, temp5v, temp6v, temp7v );
2261 
2262 #if 1
2263     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2264     satdv = vec_sum4s( temp0v, zero_s32v);
2265 
2266     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2267     satdv1= vec_sum4s( temp1v, zero_s32v );
2268 
2269     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2270     satdv2= vec_sum4s( temp2v, zero_s32v );
2271 
2272     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2273     satdv3= vec_sum4s( temp3v, zero_s32v );
2274 
2275     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2276     satdv4 = vec_sum4s( temp4v, zero_s32v);
2277 
2278     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2279     satdv5= vec_sum4s( temp5v, zero_s32v );
2280 
2281     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2282     satdv6= vec_sum4s( temp6v, zero_s32v );
2283 
2284     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2285     satdv7= vec_sum4s( temp7v, zero_s32v );
2286 
2287     satdv  += satdv1;
2288     satdv2 += satdv3;
2289     satdv4 += satdv5;
2290     satdv6 += satdv7;
2291 
2292     satdv  += satdv2;
2293     satdv4 += satdv6;
2294     satdv  += satdv4;
2295 
2296     satdv = vec_sums( satdv, zero_s32v );
2297     sum = vec_extract(satdv, 3);
2298 #else
2299     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2300     satdv = vec_sum4s( temp0v, zero_s32v);
2301 
2302     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2303     satdv= vec_sum4s( temp1v, satdv );
2304 
2305     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2306     satdv= vec_sum4s( temp2v, satdv );
2307 
2308     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2309     satdv= vec_sum4s( temp3v, satdv );
2310 
2311     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2312     satdv = vec_sum4s( temp4v, satdv);
2313 
2314     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2315     satdv= vec_sum4s( temp5v, satdv );
2316 
2317     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2318     satdv= vec_sum4s( temp6v, satdv );
2319 
2320     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2321     satdv= vec_sum4s( temp7v, satdv );
2322 
2323     satdv = vec_sums( satdv, zero_s32v );
2324     satdv = vec_splat( satdv, 3 );
2325     vec_ste( satdv, 0, &sum );
2326 #endif
2327     return sum >> 1;
2328 }
2329 
2330 int satd_16x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2331 {
2332     ALIGN_VAR_16( int, sum );
2333     LOAD_ZERO;
2334     //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
2335     vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
2336     vec_s16_t pix1v, pix2v;
2337     vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
2338               diffh4v, diffh5v, diffh6v, diffh7v;
2339     vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
2340               diffl4v, diffl5v, diffl6v, diffl7v;
2341     vec_s16_t temp0v, temp1v, temp2v, temp3v,
2342               temp4v, temp5v, temp6v, temp7v;
2343 
2344     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2345     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2346     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2347     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2348     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh4v, diffl4v);
2349     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh5v, diffl5v);
2350     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh6v, diffl6v);
2351     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh7v, diffl7v);
2352 
2353     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2354     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2355 
2356     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2357                      temp4v, temp5v, temp6v, temp7v,
2358                      diffh0v, diffh1v, diffh2v, diffh3v,
2359                      diffh4v, diffh5v, diffh6v, diffh7v );
2360 
2361     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2362     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2363 
2364 #if 1
2365     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2366     satdv = vec_sum4s( temp0v, zero_s32v);
2367 
2368     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2369     satdv1= vec_sum4s( temp1v, zero_s32v );
2370 
2371     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2372     satdv2= vec_sum4s( temp2v, zero_s32v );
2373 
2374     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2375     satdv3= vec_sum4s( temp3v, zero_s32v );
2376 
2377     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2378     satdv4 = vec_sum4s( temp4v, zero_s32v);
2379 
2380     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2381     satdv5= vec_sum4s( temp5v, zero_s32v );
2382 
2383     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2384     satdv6= vec_sum4s( temp6v, zero_s32v );
2385 
2386     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2387     satdv7= vec_sum4s( temp7v, zero_s32v );
2388 
2389     satdv  += satdv1;
2390     satdv2 += satdv3;
2391     satdv4 += satdv5;
2392     satdv6 += satdv7;
2393 
2394     satdv  += satdv2;
2395     satdv4 += satdv6;
2396     satdv  += satdv4;
2397 #else
2398     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2399     satdv = vec_sum4s( temp0v, zero_s32v);
2400 
2401     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2402     satdv= vec_sum4s( temp1v, satdv );
2403 
2404     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2405     satdv= vec_sum4s( temp2v, satdv );
2406 
2407     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2408     satdv= vec_sum4s( temp3v, satdv );
2409 
2410     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2411     satdv = vec_sum4s( temp4v, satdv);
2412 
2413     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2414     satdv= vec_sum4s( temp5v, satdv );
2415 
2416     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2417     satdv= vec_sum4s( temp6v, satdv );
2418 
2419     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2420     satdv= vec_sum4s( temp7v, satdv );
2421 #endif
2422 
2423     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2424     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );
2425 
2426     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2427                      temp4v, temp5v, temp6v, temp7v,
2428                      diffl0v, diffl1v, diffl2v, diffl3v,
2429                      diffl4v, diffl5v, diffl6v, diffl7v );
2430 
2431     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2432     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v,  temp4v, temp5v, temp6v, temp7v );
2433 
2434 #if 1
2435     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2436     satdv += vec_sum4s( temp0v, zero_s32v);
2437 
2438     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2439     satdv1= vec_sum4s( temp1v, zero_s32v );
2440 
2441     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2442     satdv2= vec_sum4s( temp2v, zero_s32v );
2443 
2444     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2445     satdv3= vec_sum4s( temp3v, zero_s32v );
2446 
2447     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2448     satdv4 = vec_sum4s( temp4v, zero_s32v);
2449 
2450     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2451     satdv5= vec_sum4s( temp5v, zero_s32v );
2452 
2453     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2454     satdv6= vec_sum4s( temp6v, zero_s32v );
2455 
2456     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2457     satdv7= vec_sum4s( temp7v, zero_s32v );
2458 
2459     satdv  += satdv1;
2460     satdv2 += satdv3;
2461     satdv4 += satdv5;
2462     satdv6 += satdv7;
2463 
2464     satdv  += satdv2;
2465     satdv4 += satdv6;
2466     satdv  += satdv4;
2467 
2468     satdv = vec_sums( satdv, zero_s32v );
2469     sum = vec_extract(satdv, 3);
2470 #else
2471     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2472     satdv = vec_sum4s( temp0v, satdv);
2473 
2474     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2475     satdv= vec_sum4s( temp1v, satdv );
2476 
2477     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2478     satdv= vec_sum4s( temp2v, satdv );
2479 
2480     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2481     satdv= vec_sum4s( temp3v, satdv );
2482 
2483     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2484     satdv = vec_sum4s( temp4v, satdv);
2485 
2486     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2487     satdv= vec_sum4s( temp5v, satdv );
2488 
2489     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2490     satdv= vec_sum4s( temp6v, satdv );
2491 
2492     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2493     satdv= vec_sum4s( temp7v, satdv );
2494 
2495     satdv = vec_sums( satdv, zero_s32v );
2496     satdv = vec_splat( satdv, 3 );
2497     vec_ste( satdv, 0, &sum );
2498 #endif
2499     return sum >> 1;
2500 }
2501 
2502 int satd_16x16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2503 {
2504     ALIGN_VAR_16( int, sum );
2505     LOAD_ZERO;
2506     //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
2507     vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
2508     vec_s16_t pix1v, pix2v;
2509     vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
2510               diffh4v, diffh5v, diffh6v, diffh7v;
2511     vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
2512               diffl4v, diffl5v, diffl6v, diffl7v;
2513     vec_s16_t temp0v, temp1v, temp2v, temp3v,
2514               temp4v, temp5v, temp6v, temp7v;
2515 
2516     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2517     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2518     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2519     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2520     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh4v, diffl4v);
2521     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh5v, diffl5v);
2522     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh6v, diffl6v);
2523     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh7v, diffl7v);
2524 
2525     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2526     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2527 
2528     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2529                      temp4v, temp5v, temp6v, temp7v,
2530                      diffh0v, diffh1v, diffh2v, diffh3v,
2531                      diffh4v, diffh5v, diffh6v, diffh7v );
2532 
2533     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2534     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2535 
2536 #if 1
2537     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2538     satdv = vec_sum4s( temp0v, zero_s32v);
2539 
2540     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2541     satdv1= vec_sum4s( temp1v, zero_s32v );
2542 
2543     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2544     satdv2= vec_sum4s( temp2v, zero_s32v );
2545 
2546     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2547     satdv3= vec_sum4s( temp3v, zero_s32v );
2548 
2549     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2550     satdv4 = vec_sum4s( temp4v, zero_s32v);
2551 
2552     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2553     satdv5= vec_sum4s( temp5v, zero_s32v );
2554 
2555     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2556     satdv6= vec_sum4s( temp6v, zero_s32v );
2557 
2558     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2559     satdv7= vec_sum4s( temp7v, zero_s32v );
2560 
2561     satdv  += satdv1;
2562     satdv2 += satdv3;
2563     satdv4 += satdv5;
2564     satdv6 += satdv7;
2565 
2566     satdv  += satdv2;
2567     satdv4 += satdv6;
2568     satdv  += satdv4;
2569 #else
2570     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2571     satdv = vec_sum4s( temp0v, zero_s32v);
2572 
2573     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2574     satdv= vec_sum4s( temp1v, satdv );
2575 
2576     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2577     satdv= vec_sum4s( temp2v, satdv );
2578 
2579     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2580     satdv= vec_sum4s( temp3v, satdv );
2581 
2582     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2583     satdv = vec_sum4s( temp4v, satdv);
2584 
2585     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2586     satdv= vec_sum4s( temp5v, satdv );
2587 
2588     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2589     satdv= vec_sum4s( temp6v, satdv );
2590 
2591     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2592     satdv= vec_sum4s( temp7v, satdv );
2593 #endif
2594 
2595     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2596     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );
2597 
2598     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2599                      temp4v, temp5v, temp6v, temp7v,
2600                      diffl0v, diffl1v, diffl2v, diffl3v,
2601                      diffl4v, diffl5v, diffl6v, diffl7v );
2602 
2603     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2604     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v,  temp4v, temp5v, temp6v, temp7v );
2605 
2606 #if 1
2607     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2608     satdv += vec_sum4s( temp0v, zero_s32v);
2609 
2610     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2611     satdv1= vec_sum4s( temp1v, zero_s32v );
2612 
2613     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2614     satdv2= vec_sum4s( temp2v, zero_s32v );
2615 
2616     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2617     satdv3= vec_sum4s( temp3v, zero_s32v );
2618 
2619     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2620     satdv4 = vec_sum4s( temp4v, zero_s32v);
2621 
2622     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2623     satdv5= vec_sum4s( temp5v, zero_s32v );
2624 
2625     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2626     satdv6= vec_sum4s( temp6v, zero_s32v );
2627 
2628     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2629     satdv7= vec_sum4s( temp7v, zero_s32v );
2630 
2631     satdv  += satdv1;
2632     satdv2 += satdv3;
2633     satdv4 += satdv5;
2634     satdv6 += satdv7;
2635 
2636     satdv  += satdv2;
2637     satdv4 += satdv6;
2638     satdv  += satdv4;
2639 
2640 #else
2641     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2642     satdv = vec_sum4s( temp0v, satdv);
2643 
2644     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2645     satdv= vec_sum4s( temp1v, satdv );
2646 
2647     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2648     satdv= vec_sum4s( temp2v, satdv );
2649 
2650     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2651     satdv= vec_sum4s( temp3v, satdv );
2652 
2653     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2654     satdv = vec_sum4s( temp4v, satdv);
2655 
2656     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2657     satdv= vec_sum4s( temp5v, satdv );
2658 
2659     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2660     satdv= vec_sum4s( temp6v, satdv );
2661 
2662     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2663     satdv= vec_sum4s( temp7v, satdv );
2664 #endif
2665     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2666     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2667     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2668     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2669     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh4v, diffl4v);
2670     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh5v, diffl5v);
2671     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh6v, diffl6v);
2672     VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh7v, diffl7v);
2673 
2674     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2675     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2676 
2677     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2678                      temp4v, temp5v, temp6v, temp7v,
2679                      diffh0v, diffh1v, diffh2v, diffh3v,
2680                      diffh4v, diffh5v, diffh6v, diffh7v );
2681 
2682     HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2683     HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2684 
2685 #if 1
2686     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2687     satdv += vec_sum4s( temp0v, zero_s32v);
2688 
2689     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2690     satdv1= vec_sum4s( temp1v, zero_s32v );
2691 
2692     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2693     satdv2= vec_sum4s( temp2v, zero_s32v );
2694 
2695     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2696     satdv3= vec_sum4s( temp3v, zero_s32v );
2697 
2698     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2699     satdv4 = vec_sum4s( temp4v, zero_s32v);
2700 
2701     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2702     satdv5= vec_sum4s( temp5v, zero_s32v );
2703 
2704     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2705     satdv6= vec_sum4s( temp6v, zero_s32v );
2706 
2707     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2708     satdv7= vec_sum4s( temp7v, zero_s32v );
2709 
2710     satdv  += satdv1;
2711     satdv2 += satdv3;
2712     satdv4 += satdv5;
2713     satdv6 += satdv7;
2714 
2715     satdv  += satdv2;
2716     satdv4 += satdv6;
2717     satdv  += satdv4;
2718 #else
2719     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2720     satdv = vec_sum4s( temp0v, satdv);
2721 
2722     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2723     satdv= vec_sum4s( temp1v, satdv );
2724 
2725     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2726     satdv= vec_sum4s( temp2v, satdv );
2727 
2728     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2729     satdv= vec_sum4s( temp3v, satdv );
2730 
2731     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2732     satdv = vec_sum4s( temp4v, satdv);
2733 
2734     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2735     satdv= vec_sum4s( temp5v, satdv );
2736 
2737     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2738     satdv= vec_sum4s( temp6v, satdv );
2739 
2740     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2741     satdv= vec_sum4s( temp7v, satdv );
2742 #endif
2743     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2744     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );
2745 
2746     VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2747                      temp4v, temp5v, temp6v, temp7v,
2748                      diffl0v, diffl1v, diffl2v, diffl3v,
2749                      diffl4v, diffl5v, diffl6v, diffl7v );
2750 
2751     HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2752     HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v,  temp4v, temp5v, temp6v, temp7v );
2753 
2754 #if 1
2755     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2756     satdv += vec_sum4s( temp0v, zero_s32v);
2757 
2758     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2759     satdv1= vec_sum4s( temp1v, zero_s32v );
2760 
2761     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2762     satdv2= vec_sum4s( temp2v, zero_s32v );
2763 
2764     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2765     satdv3= vec_sum4s( temp3v, zero_s32v );
2766 
2767     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2768     satdv4 = vec_sum4s( temp4v, zero_s32v);
2769 
2770     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2771     satdv5= vec_sum4s( temp5v, zero_s32v );
2772 
2773     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2774     satdv6= vec_sum4s( temp6v, zero_s32v );
2775 
2776     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2777     satdv7= vec_sum4s( temp7v, zero_s32v );
2778 
2779     satdv  += satdv1;
2780     satdv2 += satdv3;
2781     satdv4 += satdv5;
2782     satdv6 += satdv7;
2783 
2784     satdv  += satdv2;
2785     satdv4 += satdv6;
2786     satdv  += satdv4;
2787 
2788     satdv = vec_sums( satdv, zero_s32v );
2789     sum = vec_extract(satdv, 3);
2790 #else
2791     temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2792     satdv = vec_sum4s( temp0v, satdv);
2793 
2794     temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2795     satdv= vec_sum4s( temp1v, satdv );
2796 
2797     temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2798     satdv= vec_sum4s( temp2v, satdv );
2799 
2800     temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2801     satdv= vec_sum4s( temp3v, satdv );
2802 
2803     temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2804     satdv = vec_sum4s( temp4v, satdv);
2805 
2806     temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2807     satdv= vec_sum4s( temp5v, satdv );
2808 
2809     temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2810     satdv= vec_sum4s( temp6v, satdv );
2811 
2812     temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2813     satdv= vec_sum4s( temp7v, satdv );
2814 
2815     satdv = vec_sums( satdv, zero_s32v );
2816     satdv = vec_splat( satdv, 3 );
2817     vec_ste( satdv, 0, &sum );
2818 #endif
2819     return sum >> 1;
2820 }
2821 
2822 
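/* Generic SATD entry point: the explicit specializations below tile every
 * supported w x h partition out of the hand-written 4xN, 8xN and 16x16
 * AltiVec kernels above. */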
2823 template<int w, int h>
2824 int satd_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
2825 
2826 template<>
2827 int satd_altivec<4, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2828 {
2829     return satd_4x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
2830 }
2831 
2832 template<>
2833 int satd_altivec<4, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2834 {
2835     return  satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
2836 }
2837 
2838 template<>
2839 int satd_altivec<4, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2840 {
2841     int satd = 0;
2842     satd = satd_4x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
2843 		+ satd_4x8_altivec(pix1+4*stride_pix1, stride_pix1, pix2+4*stride_pix2, stride_pix2);
2844 
2845     return satd;
2846 }
2847 
2848 template<>
2849 int satd_altivec<4, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2850 {
2851     int satd = 0;
2852     satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2853 		+ satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);
2854 
2855     return satd;
2856 }
2857 
2858 template<>
2859 int satd_altivec<4, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2860 {
2861     int satd = 0;
2862     satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2863 		+ satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2)
2864 		+ satd_4x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
2865 
2866     return satd;
2867 }
2868 
2869 template<>
2870 int satd_altivec<4, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2871 {
2872     int satd = 0;
2873     satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2874 		+ satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2)
2875 		+ satd_4x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
2876 		+ satd_4x8_altivec(pix1+24*stride_pix1, stride_pix1, pix2+24*stride_pix2, stride_pix2);
2877 
2878     return satd;
2879 }
2880 
2881 template<>
2882 int satd_altivec<4, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2883 {
2884     int satd = 0;
2885     satd = satd_altivec<4, 32>(pix1, stride_pix1, pix2, stride_pix2)
2886 		+ satd_altivec<4, 32>(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2);
2887 
2888     return satd;
2889 }
2890 
2891 template<>
2892 int satd_altivec<8, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2893 {
2894     return satd_8x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
2895 }
2896 
2897 template<>
2898 int satd_altivec<8, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2899 {
2900     return satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
2901 }
2902 
2903 template<>
2904 int satd_altivec<8, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2905 {
2906     int satd = 0;
2907     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2908 		+ satd_8x4_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);
2909     return satd;
2910 }
2911 
2912 template<>
2913 int satd_altivec<8, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2914 {
2915     return satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2);
2916 }
2917 
2918 template<>
2919 int satd_altivec<8, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2920 {
2921     int satd = 0;
2922     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2923 		+ satd_8x16_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);
2924     return satd;
2925 }
2926 
2927 template<>
2928 int satd_altivec<8, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2929 {
2930     int satd = 0;
2931     satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
2932 		+ satd_8x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
2933     return satd;
2934 }
2935 
2936 template<>
2937 int satd_altivec<8, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2938 {
2939     int satd = 0;
2940     satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
2941 		+ satd_8x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
2942 		+ satd_8x16_altivec(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2)
2943 		+ satd_8x16_altivec(pix1+48*stride_pix1, stride_pix1, pix2+48*stride_pix2, stride_pix2);
2944     return satd;
2945 }
2946 
2947 template<>
2948 int satd_altivec<12, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2949 {
2950     int satd = 0;
2951     satd = satd_8x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
2952 		+ satd_4x4_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2);
2953     return satd;
2954 }
2955 
2956 template<>
2957 int satd_altivec<12, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2958 {
2959     int satd = 0;
2960     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2961 		+ satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2);
2962     return satd;
2963 }
2964 
2965 template<>
2966 int satd_altivec<12, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2967 {
2968     int satd = 0;
2969     const pixel *pix3 = pix1 + 8*stride_pix1;
2970     const pixel *pix4 = pix2 + 8*stride_pix2;
2971     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2972          + satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
2973          + satd_8x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
2974          + satd_4x4_altivec(pix3+8, stride_pix1, pix4+8, stride_pix2);
2975     return satd;
2976 }
2977 
2978 template<>
2979 int satd_altivec<12, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2980 {
2981     int satd = 0;
2982     const pixel *pix3 = pix1 + 8*stride_pix1;
2983     const pixel *pix4 = pix2 + 8*stride_pix2;
2984     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2985 		+ satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
2986 		+ satd_8x8_altivec(pix3, stride_pix1, pix4, stride_pix2)
2987 		+ satd_4x8_altivec(pix3+8, stride_pix1, pix4+8, stride_pix2);
2988     return satd;
2989 }
2990 
2991 template<>
2992 int satd_altivec<12, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2993 {
2994     int satd = 0;
2995     const pixel *pix3 = pix1 + 8*stride_pix1;
2996     const pixel *pix4 = pix2 + 8*stride_pix2;
2997     satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
2998 		+ satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
2999 		+ satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3000 		+ satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2);
3001     return satd;
3002 }
3003 
3004 template<>
3005 int satd_altivec<12, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3006 {
3007     int satd = 0;
3008     const pixel *pix3 = pix1 + 16*stride_pix1;
3009     const pixel *pix4 = pix2 + 16*stride_pix2;
3010     satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3011 		+ satd_altivec<4, 16>(pix1+8, stride_pix1, pix2+8, stride_pix2)
3012 		+ satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3013 		+ satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2);
3014     return satd;
3015 }
3016 
3017 template<>
3018 int satd_altivec<12, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3019 {
3020     int satd = 0;
3021     const pixel *pix3 = pix1 + 16*stride_pix1;
3022     const pixel *pix4 = pix2 + 16*stride_pix2;
3023     const pixel *pix5 = pix1 + 32*stride_pix1;
3024     const pixel *pix6 = pix2 + 32*stride_pix2;
3025     const pixel *pix7 = pix1 + 48*stride_pix1;
3026     const pixel *pix8 = pix2 + 48*stride_pix2;
3027     satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3028 		+ satd_altivec<4, 16>(pix1+8, stride_pix1, pix2+8, stride_pix2)
3029 		+ satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3030 		+ satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2)
3031 		+ satd_8x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3032 		+ satd_altivec<4, 16>(pix5+8, stride_pix1, pix6+8, stride_pix2)
3033 		+ satd_8x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
3034 		+ satd_altivec<4, 16>(pix7+8, stride_pix1, pix8+8, stride_pix2);
3035     return satd;
3036 }
3037 
3038 template<>
3039 int satd_altivec<16, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3040 {
3041     return satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
3042 }
3043 
3044 template<>
3045 int satd_altivec<16, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3046 {
3047     return satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
3048 }
3049 
3050 template<>
3051 int satd_altivec<16, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3052 {
3053     int satd = 0;
3054     satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
3055 		+ satd_16x8_altivec(pix1+4*stride_pix1, stride_pix1, pix2+4*stride_pix2, stride_pix2);
3056     return satd;
3057 }
3058 
3059 template<>
3060 int satd_altivec<16, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3061 {
3062     return satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2);
3063 }
3064 
3065 template<>
3066 int satd_altivec<16, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3067 {
3068     int satd = 0;
3069     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3070 		+ satd_16x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
3071     return satd;
3072 }
3073 
3074 template<>
3075 int satd_altivec<16, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3076 {
3077     int satd = 0;
3078     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3079 		+ satd_16x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
3080     return satd;
3081 }
3082 
3083 template<>
3084 int satd_altivec<16, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3085 {
3086     int satd = 0;
3087     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3088 		+ satd_16x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
3089 		+ satd_16x16_altivec(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2)
3090 		+ satd_16x16_altivec(pix1+48*stride_pix1, stride_pix1, pix2+48*stride_pix2, stride_pix2);
3091     return satd;
3092 }
3093 
3094 template<>
3095 int satd_altivec<24, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3096 {
3097     int satd = 0;
3098     satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
3099 		+ satd_8x4_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
3100     return satd;
3101 }
3102 
3103 template<>
3104 int satd_altivec<24, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3105 {
3106     int satd = 0;
3107     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3108 		+ satd_8x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
3109     return satd;
3110 }
3111 
3112 template<>
3113 int satd_altivec<24, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3114 {
3115     int satd = 0;
3116     const pixel *pix3 = pix1 + 8*stride_pix1;
3117     const pixel *pix4 = pix2 + 8*stride_pix2;
3118     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3119 		+ satd_8x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3120               + satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
3121 		+ satd_8x4_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2);
3122     return satd;
3123 }
3124 
3125 template<>
3126 int satd_altivec<24, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3127 {
3128     int satd = 0;
3129     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3130 		+ satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
3131     return satd;
3132 }
3133 
3134 template<>
3135 int satd_altivec<24, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3136 {
3137     int satd = 0;
3138     satd = satd_altivec<24, 16>(pix1, stride_pix1, pix2, stride_pix2)
3139 		+ satd_altivec<24, 8>(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
3140     return satd;
3141 }
3142 
3143 template<>
3144 int satd_altivec<24, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3145 {
3146     int satd = 0;
3147     const pixel *pix3 = pix1 + 16*stride_pix1;
3148     const pixel *pix4 = pix2 + 16*stride_pix2;
3149     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3150 		+ satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3151               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3152 		+ satd_8x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2);
3153     return satd;
3154 }
3155 
3156 template<>
3157 int satd_altivec<24, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3158 {
3159     int satd = 0;
3160     const pixel *pix3 = pix1 + 16*stride_pix1;
3161     const pixel *pix4 = pix2 + 16*stride_pix2;
3162     const pixel *pix5 = pix1 + 32*stride_pix1;
3163     const pixel *pix6 = pix2 + 32*stride_pix2;
3164     const pixel *pix7 = pix1 + 48*stride_pix1;
3165     const pixel *pix8 = pix2 + 48*stride_pix2;
3166     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3167               + satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3168 		+ satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3169 		+ satd_8x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3170 		+ satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3171 		+ satd_8x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
3172 		+ satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
3173 		+ satd_8x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2);
3174     return satd;
3175 }
3176 
3177 template<>
3178 int satd_altivec<32, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3179 {
3180     int satd = 0;
3181     satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
3182 		+ satd_16x4_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
3183     return satd;
3184 }
3185 
3186 template<>
3187 int satd_altivec<32, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3188 {
3189     int satd = 0;
3190     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3191 		+ satd_16x8_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
3192     return satd;
3193 }
3194 
3195 template<>
3196 int satd_altivec<32, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3197 {
3198     int satd = 0;
3199     const pixel *pix3 = pix1 + 8*stride_pix1;
3200     const pixel *pix4 = pix2 + 8*stride_pix2;
3201     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3202 		+ satd_16x8_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
3203 		+ satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
3204 		+ satd_16x4_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
3205     return satd;
3206 }
3207 
3208 template<>
3209 int satd_altivec<32, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3210 {
3211     int satd = 0;
3212     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3213 		+ satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
3214     return satd;
3215 }
3216 
3217 template<>
3218 int satd_altivec<32, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3219 {
3220     int satd = 0;
3221     const pixel *pix3 = pix1 + 16*stride_pix1;
3222     const pixel *pix4 = pix2 + 16*stride_pix2;
3223     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3224 		+ satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
3225 		+ satd_16x8_altivec(pix3, stride_pix1, pix4, stride_pix2)
3226 		+ satd_16x8_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
3227     return satd;
3228 }
3229 
3230 template<>
3231 int satd_altivec<32, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3232 {
3233     int satd = 0;
3234     const pixel *pix3 = pix1 + 16*stride_pix1;
3235     const pixel *pix4 = pix2 + 16*stride_pix2;
3236     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3237 		+ satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
3238 		+ satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3239 		+ satd_16x16_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
3240     return satd;
3241 }
3242 
3243 template<>
3244 int satd_altivec<32, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3245 {
3246     int satd = 0;
3247     const pixel *pix3 = pix1 + 16*stride_pix1;
3248     const pixel *pix4 = pix2 + 16*stride_pix2;
3249     const pixel *pix5 = pix1 + 32*stride_pix1;
3250     const pixel *pix6 = pix2 + 32*stride_pix2;
3251     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3252 		+ satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
3253 		+ satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3254 		+ satd_16x16_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2)
3255 		+ satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3256 		+ satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2);
3257     return satd;
3258 }
3259 
3260 template<>
3261 int satd_altivec<32, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3262 {
3263     int satd = 0;
3264     const pixel *pix3 = pix1 + 16*stride_pix1;
3265     const pixel *pix4 = pix2 + 16*stride_pix2;
3266     const pixel *pix5 = pix1 + 32*stride_pix1;
3267     const pixel *pix6 = pix2 + 32*stride_pix2;
3268     const pixel *pix7 = pix1 + 48*stride_pix1;
3269     const pixel *pix8 = pix2 + 48*stride_pix2;
3270     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3271               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3272 		+ satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3273 		+ satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3274 		+ satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3275 		+ satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
3276 		+ satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
3277 		+ satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2);
3278     return satd;
3279 }
3280 
3281 template<>
3282 int satd_altivec<48, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3283 {
3284     int satd = 0;
3285     satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
3286               + satd_16x4_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3287               + satd_16x4_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
3288     return satd;
3289 }
3290 
3291 template<>
3292 int satd_altivec<48, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3293 {
3294     int satd = 0;
3295     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3296               + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3297               + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
3298     return satd;
3299 }
3300 
3301 template<>
3302 int satd_altivec<48, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3303 {
3304     int satd = 0;
3305     const pixel *pix3 = pix1 + 8*stride_pix1;
3306     const pixel *pix4 = pix2 + 8*stride_pix2;
3307     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3308               + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3309               + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3310               + satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
3311               + satd_16x4_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3312               + satd_16x4_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
3313     return satd;
3314 }
3315 
3316 template<>
3317 int satd_altivec<48, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3318 {
3319     int satd = 0;
3320     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3321               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3322               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
3323     return satd;
3324 }
3325 
3326 template<>
3327 int satd_altivec<48, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3328 {
3329     int satd = 0;
3330     const pixel *pix3 = pix1 + 8*stride_pix1;
3331     const pixel *pix4 = pix2 + 8*stride_pix2;
3332     satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
3333               + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3334               + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3335               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3336               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3337               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
3338     return satd;
3339 }
3340 
3341 template<>
3342 int satd_altivec<48, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3343 {
3344     int satd = 0;
3345     const pixel *pix3 = pix1 + 16*stride_pix1;
3346     const pixel *pix4 = pix2 + 16*stride_pix2;
3347     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3348               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3349               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3350               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3351               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3352               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
3353     return satd;
3354 }
3355 
3356 template<>
3357 int satd_altivec<48, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3358 {
3359     int satd = 0;
3360     const pixel *pix3 = pix1 + 16*stride_pix1;
3361     const pixel *pix4 = pix2 + 16*stride_pix2;
3362     const pixel *pix5 = pix1 + 32*stride_pix1;
3363     const pixel *pix6 = pix2 + 32*stride_pix2;
3364     const pixel *pix7 = pix1 + 48*stride_pix1;
3365     const pixel *pix8 = pix2 + 48*stride_pix2;
3366     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3367               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3368               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3369               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3370               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3371               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
3372               + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3373               + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
3374               + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
3375               + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
3376               + satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2)
3377               + satd_16x16_altivec(pix7+32, stride_pix1, pix8+32, stride_pix2);
3378     return satd;
3379 }
3380 
3381 template<>
3382 int satd_altivec<64, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3383 {
3384     int satd = 0;
3385     satd = satd_altivec<32, 4>(pix1, stride_pix1, pix2, stride_pix2)
3386               + satd_altivec<32, 4>(pix1+32, stride_pix1, pix2+32, stride_pix2);
3387     return satd;
3388 }
3389 
3390 template<>
3391 int satd_altivec<64, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3392 {
3393     int satd = 0;
3394     satd = satd_altivec<32, 8>(pix1, stride_pix1, pix2, stride_pix2)
3395               + satd_altivec<32, 8>(pix1+32, stride_pix1, pix2+32, stride_pix2);
3396     return satd;
3397 }
3398 
3399 template<>
3400 int satd_altivec<64, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3401 {
3402     int satd = 0;
3403     satd = satd_altivec<32, 12>(pix1, stride_pix1, pix2, stride_pix2)
3404               + satd_altivec<32, 12>(pix1+32, stride_pix1, pix2+32, stride_pix2);
3405     return satd;
3406 }
3407 
3408 template<>
3409 int satd_altivec<64, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3410 {
3411     int satd = 0;
3412     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3413               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3414               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3415               + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2);
3416     return satd;
3417 }
3418 
3419 template<>
3420 int satd_altivec<64, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3421 {
3422     int satd = 0;
3423     satd = satd_altivec<32, 24>(pix1, stride_pix1, pix2, stride_pix2)
3424               + satd_altivec<32, 24>(pix1+32, stride_pix1, pix2+32, stride_pix2);
3425     return satd;
3426 }
3427 
3428 template<>
3429 int satd_altivec<64, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3430 {
3431     int satd = 0;
3432     const pixel *pix3 = pix1 + 16*stride_pix1;
3433     const pixel *pix4 = pix2 + 16*stride_pix2;
3434     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3435               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3436               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3437               + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
3438               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3439               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3440               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
3441               + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2);
3442     return satd;
3443 }
3444 
3445 template<>
3446 int satd_altivec<64, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3447 {
3448     int satd = 0;
3449     const pixel *pix3 = pix1 + 16*stride_pix1;
3450     const pixel *pix4 = pix2 + 16*stride_pix2;
3451     const pixel *pix5 = pix1 + 32*stride_pix1;
3452     const pixel *pix6 = pix2 + 32*stride_pix2;
3453     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3454               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3455               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3456               + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
3457               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3458               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3459               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
3460               + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2)
3461               + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3462               + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
3463               + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
3464               + satd_16x16_altivec(pix5+48, stride_pix1, pix6+48, stride_pix2);
3465     return satd;
3466 }
3467 
3468 template<>
3469 int satd_altivec<64, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
3470 {
3471     int satd = 0;
3472     const pixel *pix3 = pix1 + 16*stride_pix1;
3473     const pixel *pix4 = pix2 + 16*stride_pix2;
3474     const pixel *pix5 = pix1 + 32*stride_pix1;
3475     const pixel *pix6 = pix2 + 32*stride_pix2;
3476     const pixel *pix7 = pix1 + 48*stride_pix1;
3477     const pixel *pix8 = pix2 + 48*stride_pix2;
3478     satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
3479               + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
3480               + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
3481               + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
3482               + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
3483               + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
3484               + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
3485               + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2)
3486               + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
3487               + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
3488               + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
3489               + satd_16x16_altivec(pix5+48, stride_pix1, pix6+48, stride_pix2)
3490               + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
3491               + satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2)
3492               + satd_16x16_altivec(pix7+32, stride_pix1, pix8+32, stride_pix2)
3493               + satd_16x16_altivec(pix7+48, stride_pix1, pix8+48, stride_pix2);
3494     return satd;
3495 }
3496 
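/* Illustrative usage sketch (not part of the kernel code): a 32x64 SATD cost
 * between a source block and a prediction could be obtained as
 *
 *     int cost = satd_altivec<32, 64>(fenc, fencStride, pred, predStride);
 *
 * where fenc, pred and the two strides are whatever buffers the caller owns;
 * the names here are placeholders only. */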
3497 
3498 /***********************************************************************
3499  * SA8D routines - altivec implementation
3500  **********************************************************************/
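/* SA8D is the sum of absolute values of an 8x8 2-D Hadamard transform applied
 * to the pixel-difference block; the final (sum + 2) >> 2 rounds and applies
 * the usual SA8D normalization. */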
3501 #define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
3502                          sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
3503 {                                                         \
3504     /* int    a0  =        SRC(0) + SRC(4) */             \
3505     vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
3506     /* int    a4  =        SRC(0) - SRC(4) */             \
3507     vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
3508     /* int    a1  =        SRC(1) + SRC(5) */             \
3509     vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
3510     /* int    a5  =        SRC(1) - SRC(5) */             \
3511     vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
3512     /* int    a2  =        SRC(2) + SRC(6) */             \
3513     vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
3514     /* int    a6  =        SRC(2) - SRC(6) */             \
3515     vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
3516     /* int    a3  =        SRC(3) + SRC(7) */             \
3517     vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
3518     /* int    a7  =        SRC(3) - SRC(7) */             \
3519     vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
3520                                                           \
3521     /* int    b0  =         a0 + a2  */                   \
3522     vec_s16_t b0v = vec_add(a0v, a2v);                    \
3523     /* int    b2  =         a0 - a2; */                   \
3524     vec_s16_t  b2v = vec_sub(a0v, a2v);                   \
3525     /* int    b1  =         a1 + a3; */                   \
3526     vec_s16_t b1v = vec_add(a1v, a3v);                    \
3527     /* int    b3  =         a1 - a3; */                   \
3528     vec_s16_t b3v = vec_sub(a1v, a3v);                    \
3529     /* int    b4  =         a4 + a6; */                   \
3530     vec_s16_t b4v = vec_add(a4v, a6v);                    \
3531     /* int    b6  =         a4 - a6; */                   \
3532     vec_s16_t b6v = vec_sub(a4v, a6v);                    \
3533     /* int    b5  =         a5 + a7; */                   \
3534     vec_s16_t b5v = vec_add(a5v, a7v);                    \
3535     /* int    b7  =         a5 - a7; */                   \
3536     vec_s16_t b7v = vec_sub(a5v, a7v);                    \
3537                                                           \
3538     /* DST(0,        b0 + b1) */                          \
3539     sa8d0v = vec_add(b0v, b1v);                           \
3540     /* DST(1,        b0 - b1) */                          \
3541     sa8d1v = vec_sub(b0v, b1v);                           \
3542     /* DST(2,        b2 + b3) */                          \
3543     sa8d2v = vec_add(b2v, b3v);                           \
3544     /* DST(3,        b2 - b3) */                          \
3545     sa8d3v = vec_sub(b2v, b3v);                           \
3546     /* DST(4,        b4 + b5) */                          \
3547     sa8d4v = vec_add(b4v, b5v);                           \
3548     /* DST(5,        b4 - b5) */                          \
3549     sa8d5v = vec_sub(b4v, b5v);                           \
3550     /* DST(6,        b6 + b7) */                          \
3551     sa8d6v = vec_add(b6v, b7v);                           \
3552     /* DST(7,        b6 - b7) */                          \
3553     sa8d7v = vec_sub(b6v, b7v);                           \
3554 }
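/* SA8D_1D_ALTIVEC performs the 1-D 8-point Hadamard butterfly on eight
 * vec_s16_t rows in place; calling it once on the row vectors and once more
 * after VEC_TRANSPOSE_8 yields the full 2-D transform. */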
3555 
3556 inline int sa8d_8x8_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
3557 {
3558     ALIGN_VAR_16(int, sum);
3559 
3560     LOAD_ZERO;
3561     vec_s16_t pix1v, pix2v;
3562     vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
3563     vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
3564 
3565     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3566     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3567     diff0v = vec_sub( pix1v, pix2v );
3568     pix1   += i_pix1;
3569     pix2   += i_pix2;
3570 
3571     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3572     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3573     diff1v = vec_sub( pix1v, pix2v );
3574     pix1   += i_pix1;
3575     pix2   += i_pix2;
3576 
3577     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3578     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3579     diff2v = vec_sub( pix1v, pix2v );
3580     pix1   += i_pix1;
3581     pix2   += i_pix2;
3582 
3583     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3584     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3585     diff3v = vec_sub( pix1v, pix2v );
3586     pix1   += i_pix1;
3587     pix2   += i_pix2;
3588 
3589     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3590     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3591     diff4v = vec_sub( pix1v, pix2v );
3592     pix1   += i_pix1;
3593     pix2   += i_pix2;
3594 
3595     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3596     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3597     diff5v = vec_sub( pix1v, pix2v );
3598     pix1   += i_pix1;
3599     pix2   += i_pix2;
3600 
3601     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3602     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3603     diff6v = vec_sub( pix1v, pix2v );
3604     pix1   += i_pix1;
3605     pix2   += i_pix2;
3606 
3607     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3608     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3609     diff7v = vec_sub( pix1v, pix2v );
3610     pix1   += i_pix1;
3611     pix2   += i_pix2;
3612 
3613 
3614     SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
3615                     diff4v, diff5v, diff6v, diff7v);
3616     VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
3617                     diff4v, diff5v, diff6v, diff7v,
3618                     sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3619                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3620     SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3621                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3622 
3623     /* accumulation of the absolute value of all elements of the resulting bloc */
3624     vec_s16_t abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
3625     vec_s16_t abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
3626     vec_s16_t sum01v = vec_add(abs0v, abs1v);
3627 
3628     vec_s16_t abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
3629     vec_s16_t abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
3630     vec_s16_t sum23v = vec_add(abs2v, abs3v);
3631 
3632     vec_s16_t abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
3633     vec_s16_t abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
3634     vec_s16_t sum45v = vec_add(abs4v, abs5v);
3635 
3636     vec_s16_t abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
3637     vec_s16_t abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
3638     vec_s16_t sum67v = vec_add(abs6v, abs7v);
3639 
3640     vec_s16_t sum0123v = vec_add(sum01v, sum23v);
3641     vec_s16_t sum4567v = vec_add(sum45v, sum67v);
3642 
3643     vec_s32_t sumblocv;
3644 
3645     sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
3646     //print_vec_s("sum0123v", &sum0123v);
3647     //print_vec_i("sumblocv = vec_sum4s(sum0123v, 0 )", &sumblocv);
3648     sumblocv = vec_sum4s(sum4567v, sumblocv );
3649     //print_vec_s("sum4567v", &sum4567v);
3650     //print_vec_i("sumblocv = vec_sum4s(sum4567v, sumblocv )", &sumblocv);
3651     sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );
3652     //print_vec_i("sumblocv=vec_sums(sumblocv,0 )", &sumblocv);
3653     sumblocv = vec_splat(sumblocv, 3);
3654     //print_vec_i("sumblocv = vec_splat(sumblocv, 3)", &sumblocv);
3655     vec_ste(sumblocv, 0, &sum);
3656 
3657     return (sum + 2) >> 2;
3658 }
3659 
3660 
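/* Residual (int16_t) overload: a placeholder that always returns 0 (the sum
 * is never accumulated). */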
3661 int sa8d_8x8_altivec(const int16_t* pix1, intptr_t i_pix1)
3662 {
3663     int sum = 0;
3664     return ((sum+2)>>2);
3665 }
3666 
3667 inline int sa8d_8x16_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
3668 {
3669     ALIGN_VAR_16(int, sum);
3670     ALIGN_VAR_16(int, sum1);
3671 
3672     LOAD_ZERO;
3673     vec_s16_t pix1v, pix2v;
3674     vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
3675     vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;
3676 
3677     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3678     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3679     diff0v = vec_sub( pix1v, pix2v );
3680     pix1   += i_pix1;
3681     pix2   += i_pix2;
3682 
3683     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3684     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3685     diff1v = vec_sub( pix1v, pix2v );
3686     pix1   += i_pix1;
3687     pix2   += i_pix2;
3688 
3689     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3690     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3691     diff2v = vec_sub( pix1v, pix2v );
3692     pix1   += i_pix1;
3693     pix2   += i_pix2;
3694 
3695     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3696     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3697     diff3v = vec_sub( pix1v, pix2v );
3698     pix1   += i_pix1;
3699     pix2   += i_pix2;
3700 
3701     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3702     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3703     diff4v = vec_sub( pix1v, pix2v );
3704     pix1   += i_pix1;
3705     pix2   += i_pix2;
3706 
3707     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3708     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3709     diff5v = vec_sub( pix1v, pix2v );
3710     pix1   += i_pix1;
3711     pix2   += i_pix2;
3712 
3713     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3714     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3715     diff6v = vec_sub( pix1v, pix2v );
3716     pix1   += i_pix1;
3717     pix2   += i_pix2;
3718 
3719     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3720     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3721     diff7v = vec_sub( pix1v, pix2v );
3722     pix1   += i_pix1;
3723     pix2   += i_pix2;
3724 
3725 
3726     SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
3727                     diff4v, diff5v, diff6v, diff7v);
3728     VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
3729                     diff4v, diff5v, diff6v, diff7v,
3730                     sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3731                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3732     SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3733                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3734 
3735     /* accumulation of the absolute value of all elements of the resulting bloc */
3736     vec_s16_t abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
3737     vec_s16_t abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
3738     vec_s16_t sum01v = vec_add(abs0v, abs1v);
3739 
3740     vec_s16_t abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
3741     vec_s16_t abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
3742     vec_s16_t sum23v = vec_add(abs2v, abs3v);
3743 
3744     vec_s16_t abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
3745     vec_s16_t abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
3746     vec_s16_t sum45v = vec_add(abs4v, abs5v);
3747 
3748     vec_s16_t abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
3749     vec_s16_t abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
3750     vec_s16_t sum67v = vec_add(abs6v, abs7v);
3751 
3752     vec_s16_t sum0123v = vec_add(sum01v, sum23v);
3753     vec_s16_t sum4567v = vec_add(sum45v, sum67v);
3754 
3755     vec_s32_t sumblocv, sumblocv1;
3756 
3757     sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
3758     sumblocv = vec_sum4s(sum4567v, sumblocv );
3759     sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );
3760     sumblocv = vec_splat(sumblocv, 3);
3761     vec_ste(sumblocv, 0, &sum);
3762 
3763     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3764     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3765     diff0v = vec_sub( pix1v, pix2v );
3766     pix1   += i_pix1;
3767     pix2   += i_pix2;
3768 
3769     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3770     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3771     diff1v = vec_sub( pix1v, pix2v );
3772     pix1   += i_pix1;
3773     pix2   += i_pix2;
3774 
3775     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3776     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3777     diff2v = vec_sub( pix1v, pix2v );
3778     pix1   += i_pix1;
3779     pix2   += i_pix2;
3780 
3781     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3782     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3783     diff3v = vec_sub( pix1v, pix2v );
3784     pix1   += i_pix1;
3785     pix2   += i_pix2;
3786 
3787     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3788     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3789     diff4v = vec_sub( pix1v, pix2v );
3790     pix1   += i_pix1;
3791     pix2   += i_pix2;
3792 
3793     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3794     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3795     diff5v = vec_sub( pix1v, pix2v );
3796     pix1   += i_pix1;
3797     pix2   += i_pix2;
3798 
3799     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3800     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3801     diff6v = vec_sub( pix1v, pix2v );
3802     pix1   += i_pix1;
3803     pix2   += i_pix2;
3804 
3805     pix1v = vec_u8_to_s16(vec_xl(0, pix1));
3806     pix2v = vec_u8_to_s16( vec_xl(0, pix2) );
3807     diff7v = vec_sub( pix1v, pix2v );
3808     pix1   += i_pix1;
3809     pix2   += i_pix2;
3810 
3811 
3812     SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
3813                     diff4v, diff5v, diff6v, diff7v);
3814     VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
3815                     diff4v, diff5v, diff6v, diff7v,
3816                     sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3817                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3818     SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
3819                     sa8d4v, sa8d5v, sa8d6v, sa8d7v );
3820 
3821     /* accumulation of the absolute value of all elements of the resulting bloc */
3822     abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
3823     abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
3824     sum01v = vec_add(abs0v, abs1v);
3825 
3826     abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
3827     abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
3828     sum23v = vec_add(abs2v, abs3v);
3829 
3830     abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
3831     abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
3832     sum45v = vec_add(abs4v, abs5v);
3833 
3834     abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
3835     abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
3836     sum67v = vec_add(abs6v, abs7v);
3837 
3838     sum0123v = vec_add(sum01v, sum23v);
3839     sum4567v = vec_add(sum45v, sum67v);
3840 
3841     sumblocv1 = vec_sum4s(sum0123v, (vec_s32_t)zerov );
3842     sumblocv1 = vec_sum4s(sum4567v, sumblocv1 );
3843     sumblocv1 = vec_sums(sumblocv1, (vec_s32_t)zerov );
3844     sumblocv1 = vec_splat(sumblocv1, 3);
3845     vec_ste(sumblocv1, 0, &sum1);
3846 
3847     sum = (sum + 2) >> 2;
3848     sum1 = (sum1 + 2) >> 2;
3849     sum += sum1;
3850     return (sum);
3851 }
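/* The 8x16 kernel above runs the 8x8 Hadamard transform twice, once on each
 * 8-row half of the block; each half is rounded with (sum + 2) >> 2 on its own
 * and the two rounded results are added. */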

inline int sa8d_16x8_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sumh);
    ALIGN_VAR_16(int, suml);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v;
    vec_s16_t sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;

    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh0v, diffl0v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh1v, diffl1v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh2v, diffl2v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh3v, diffl3v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh4v, diffl4v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh5v, diffl5v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh6v, diffl6v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh7v, diffl7v);

    SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
    VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
                    sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v );
    SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);

    SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
    VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
                    sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v );
    SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);

    /* accumulation of the absolute value of all elements of the resulting block */
    sa8dh0v = vec_max( sa8dh0v, vec_sub( zero_s16v, sa8dh0v ) );
    sa8dh1v = vec_max( sa8dh1v, vec_sub( zero_s16v, sa8dh1v ) );
    vec_s16_t sumh01v = vec_add(sa8dh0v, sa8dh1v);

    sa8dh2v = vec_max( sa8dh2v, vec_sub( zero_s16v, sa8dh2v ) );
    sa8dh3v = vec_max( sa8dh3v, vec_sub( zero_s16v, sa8dh3v ) );
    vec_s16_t sumh23v = vec_add(sa8dh2v, sa8dh3v);

    sa8dh4v = vec_max( sa8dh4v, vec_sub( zero_s16v, sa8dh4v ) );
    sa8dh5v = vec_max( sa8dh5v, vec_sub( zero_s16v, sa8dh5v ) );
    vec_s16_t sumh45v = vec_add(sa8dh4v, sa8dh5v);

    sa8dh6v = vec_max( sa8dh6v, vec_sub( zero_s16v, sa8dh6v ) );
    sa8dh7v = vec_max( sa8dh7v, vec_sub( zero_s16v, sa8dh7v ) );
    vec_s16_t sumh67v = vec_add(sa8dh6v, sa8dh7v);

    vec_s16_t sumh0123v = vec_add(sumh01v, sumh23v);
    vec_s16_t sumh4567v = vec_add(sumh45v, sumh67v);

    vec_s32_t sumblocv_h;

    sumblocv_h = vec_sum4s(sumh0123v, (vec_s32_t)zerov );
    //print_vec_s("sum0123v", &sum0123v);
    //print_vec_i("sumblocv = vec_sum4s(sum0123v, 0 )", &sumblocv);
    sumblocv_h = vec_sum4s(sumh4567v, sumblocv_h );
    //print_vec_s("sum4567v", &sum4567v);
    //print_vec_i("sumblocv = vec_sum4s(sum4567v, sumblocv )", &sumblocv);
    sumblocv_h = vec_sums(sumblocv_h, (vec_s32_t)zerov );
    //print_vec_i("sumblocv=vec_sums(sumblocv,0 )", &sumblocv);
    sumblocv_h = vec_splat(sumblocv_h, 3);
    //print_vec_i("sumblocv = vec_splat(sumblocv, 3)", &sumblocv);
    vec_ste(sumblocv_h, 0, &sumh);

    sa8dl0v = vec_max( sa8dl0v, vec_sub( zero_s16v, sa8dl0v ) );
    sa8dl1v = vec_max( sa8dl1v, vec_sub( zero_s16v, sa8dl1v ) );
    vec_s16_t suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max( sa8dl2v, vec_sub( zero_s16v, sa8dl2v ) );
    sa8dl3v = vec_max( sa8dl3v, vec_sub( zero_s16v, sa8dl3v ) );
    vec_s16_t suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max( sa8dl4v, vec_sub( zero_s16v, sa8dl4v ) );
    sa8dl5v = vec_max( sa8dl5v, vec_sub( zero_s16v, sa8dl5v ) );
    vec_s16_t suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max( sa8dl6v, vec_sub( zero_s16v, sa8dl6v ) );
    sa8dl7v = vec_max( sa8dl7v, vec_sub( zero_s16v, sa8dl7v ) );
    vec_s16_t suml67v = vec_add(sa8dl6v, sa8dl7v);

    vec_s16_t suml0123v = vec_add(suml01v, suml23v);
    vec_s16_t suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l;

    sumblocv_l = vec_sum4s(suml0123v, (vec_s32_t)zerov );
    //print_vec_s("sum0123v", &sum0123v);
    //print_vec_i("sumblocv = vec_sum4s(sum0123v, 0 )", &sumblocv);
    sumblocv_l = vec_sum4s(suml4567v, sumblocv_l );
    //print_vec_s("sum4567v", &sum4567v);
    //print_vec_i("sumblocv = vec_sum4s(sum4567v, sumblocv )", &sumblocv);
    sumblocv_l = vec_sums(sumblocv_l, (vec_s32_t)zerov );
    //print_vec_i("sumblocv=vec_sums(sumblocv,0 )", &sumblocv);
    sumblocv_l = vec_splat(sumblocv_l, 3);
    //print_vec_i("sumblocv = vec_splat(sumblocv, 3)", &sumblocv);
    vec_ste(sumblocv_l, 0, &suml);

    sumh = (sumh + 2) >> 2;
    suml = (suml + 2) >> 2;
    return (sumh + suml);
}
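/* The 16x8 kernel above uses VEC_DIFF_S16 to split each 16-pixel row into a
 * high and a low 8-pixel half (the diffh and diffl vectors), transforms both
 * 8x8 halves, and again rounds each half separately before adding them. */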

inline int sa8d_16x16_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sumh0);
    ALIGN_VAR_16(int, suml0);

    ALIGN_VAR_16(int, sumh1);
    ALIGN_VAR_16(int, suml1);

    ALIGN_VAR_16(int, sum);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v;
    vec_s16_t sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;

    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh0v, diffl0v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh1v, diffl1v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh2v, diffl2v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh3v, diffl3v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh4v, diffl4v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh5v, diffl5v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh6v, diffl6v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh7v, diffl7v);

    SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
    VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
                    sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v );
    SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);

    SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
    VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
                    sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v );
    SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);

    /* accumulation of the absolute value of all elements of the resulting block */
    sa8dh0v = vec_max( sa8dh0v, vec_sub( zero_s16v, sa8dh0v ) );
    sa8dh1v = vec_max( sa8dh1v, vec_sub( zero_s16v, sa8dh1v ) );
    vec_s16_t sumh01v = vec_add(sa8dh0v, sa8dh1v);

    sa8dh2v = vec_max( sa8dh2v, vec_sub( zero_s16v, sa8dh2v ) );
    sa8dh3v = vec_max( sa8dh3v, vec_sub( zero_s16v, sa8dh3v ) );
    vec_s16_t sumh23v = vec_add(sa8dh2v, sa8dh3v);

    sa8dh4v = vec_max( sa8dh4v, vec_sub( zero_s16v, sa8dh4v ) );
    sa8dh5v = vec_max( sa8dh5v, vec_sub( zero_s16v, sa8dh5v ) );
    vec_s16_t sumh45v = vec_add(sa8dh4v, sa8dh5v);

    sa8dh6v = vec_max( sa8dh6v, vec_sub( zero_s16v, sa8dh6v ) );
    sa8dh7v = vec_max( sa8dh7v, vec_sub( zero_s16v, sa8dh7v ) );
    vec_s16_t sumh67v = vec_add(sa8dh6v, sa8dh7v);

    vec_s16_t sumh0123v = vec_add(sumh01v, sumh23v);
    vec_s16_t sumh4567v = vec_add(sumh45v, sumh67v);

    vec_s32_t sumblocv_h0;

    sumblocv_h0 = vec_sum4s(sumh0123v, (vec_s32_t)zerov );
    sumblocv_h0 = vec_sum4s(sumh4567v, sumblocv_h0 );
    sumblocv_h0 = vec_sums(sumblocv_h0, (vec_s32_t)zerov );
    sumblocv_h0 = vec_splat(sumblocv_h0, 3);
    vec_ste(sumblocv_h0, 0, &sumh0);

    sa8dl0v = vec_max( sa8dl0v, vec_sub( zero_s16v, sa8dl0v ) );
    sa8dl1v = vec_max( sa8dl1v, vec_sub( zero_s16v, sa8dl1v ) );
    vec_s16_t suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max( sa8dl2v, vec_sub( zero_s16v, sa8dl2v ) );
    sa8dl3v = vec_max( sa8dl3v, vec_sub( zero_s16v, sa8dl3v ) );
    vec_s16_t suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max( sa8dl4v, vec_sub( zero_s16v, sa8dl4v ) );
    sa8dl5v = vec_max( sa8dl5v, vec_sub( zero_s16v, sa8dl5v ) );
    vec_s16_t suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max( sa8dl6v, vec_sub( zero_s16v, sa8dl6v ) );
    sa8dl7v = vec_max( sa8dl7v, vec_sub( zero_s16v, sa8dl7v ) );
    vec_s16_t suml67v = vec_add(sa8dl6v, sa8dl7v);

    vec_s16_t suml0123v = vec_add(suml01v, suml23v);
    vec_s16_t suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l0;

    sumblocv_l0 = vec_sum4s(suml0123v, (vec_s32_t)zerov );
    sumblocv_l0 = vec_sum4s(suml4567v, sumblocv_l0 );
    sumblocv_l0 = vec_sums(sumblocv_l0, (vec_s32_t)zerov );
    sumblocv_l0 = vec_splat(sumblocv_l0, 3);
    vec_ste(sumblocv_l0, 0, &suml0);

    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh0v, diffl0v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh1v, diffl1v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh2v, diffl2v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh3v, diffl3v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh4v, diffl4v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh5v, diffl5v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh6v, diffl6v);
    VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh7v, diffl7v);

    SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
    VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
                    sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v );
    SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);

    SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
    VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
                    sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v );
    SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);

    /* accumulation of the absolute value of all elements of the resulting block */
    sa8dh0v = vec_max( sa8dh0v, vec_sub( zero_s16v, sa8dh0v ) );
    sa8dh1v = vec_max( sa8dh1v, vec_sub( zero_s16v, sa8dh1v ) );
    sumh01v = vec_add(sa8dh0v, sa8dh1v);

    sa8dh2v = vec_max( sa8dh2v, vec_sub( zero_s16v, sa8dh2v ) );
    sa8dh3v = vec_max( sa8dh3v, vec_sub( zero_s16v, sa8dh3v ) );
    sumh23v = vec_add(sa8dh2v, sa8dh3v);

    sa8dh4v = vec_max( sa8dh4v, vec_sub( zero_s16v, sa8dh4v ) );
    sa8dh5v = vec_max( sa8dh5v, vec_sub( zero_s16v, sa8dh5v ) );
    sumh45v = vec_add(sa8dh4v, sa8dh5v);

    sa8dh6v = vec_max( sa8dh6v, vec_sub( zero_s16v, sa8dh6v ) );
    sa8dh7v = vec_max( sa8dh7v, vec_sub( zero_s16v, sa8dh7v ) );
    sumh67v = vec_add(sa8dh6v, sa8dh7v);

    sumh0123v = vec_add(sumh01v, sumh23v);
    sumh4567v = vec_add(sumh45v, sumh67v);

    vec_s32_t sumblocv_h1;

    sumblocv_h1 = vec_sum4s(sumh0123v, (vec_s32_t)zerov );
    sumblocv_h1 = vec_sum4s(sumh4567v, sumblocv_h1 );
    sumblocv_h1 = vec_sums(sumblocv_h1, (vec_s32_t)zerov );
    sumblocv_h1 = vec_splat(sumblocv_h1, 3);
    vec_ste(sumblocv_h1, 0, &sumh1);

    sa8dl0v = vec_max( sa8dl0v, vec_sub( zero_s16v, sa8dl0v ) );
    sa8dl1v = vec_max( sa8dl1v, vec_sub( zero_s16v, sa8dl1v ) );
    suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max( sa8dl2v, vec_sub( zero_s16v, sa8dl2v ) );
    sa8dl3v = vec_max( sa8dl3v, vec_sub( zero_s16v, sa8dl3v ) );
    suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max( sa8dl4v, vec_sub( zero_s16v, sa8dl4v ) );
    sa8dl5v = vec_max( sa8dl5v, vec_sub( zero_s16v, sa8dl5v ) );
    suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max( sa8dl6v, vec_sub( zero_s16v, sa8dl6v ) );
    sa8dl7v = vec_max( sa8dl7v, vec_sub( zero_s16v, sa8dl7v ) );
    suml67v = vec_add(sa8dl6v, sa8dl7v);

    suml0123v = vec_add(suml01v, suml23v);
    suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l1;

    sumblocv_l1 = vec_sum4s(suml0123v, (vec_s32_t)zerov );
    sumblocv_l1 = vec_sum4s(suml4567v, sumblocv_l1 );
    sumblocv_l1 = vec_sums(sumblocv_l1, (vec_s32_t)zerov );
    sumblocv_l1 = vec_splat(sumblocv_l1, 3);
    vec_ste(sumblocv_l1, 0, &suml1);

    sum = (sumh0 + suml0 + sumh1 + suml1 + 2) >> 2;
    return sum;
}
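/* The 16x16 kernel above covers the block with four 8x8 sub-transforms (two
 * row bands, each split into a high and a low half) and, unlike the 8x16 and
 * 16x8 variants, accumulates all four raw sums before applying the
 * (x + 2) >> 2 rounding once at the end. */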

int sa8d_16x32_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 * i_pix1, i_pix1, pix2 + 16 * i_pix2, i_pix2);
    return sum;
}

int sa8d_32x32_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    int offset1, offset2;
    offset1 = 16 * i_pix1;
    offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2);
    return sum;
}

int sa8d_32x64_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    int offset1, offset2;
    offset1 = 16 * i_pix1;
    offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 * i_pix1, i_pix1, pix2 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 32 * i_pix1, i_pix1, pix2 + 16 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 * i_pix1, i_pix1, pix2 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 48 * i_pix1, i_pix1, pix2 + 16 + 48 * i_pix2, i_pix2);
    return sum;
}

int sa8d_64x64_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    int offset1, offset2;
    offset1 = 16 * i_pix1;
    offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32, i_pix1, pix2 + 32, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48, i_pix1, pix2 + 48, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + offset1, i_pix1, pix2 + 32 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + offset1, i_pix1, pix2 + 48 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 * i_pix1, i_pix1, pix2 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 32 * i_pix1, i_pix1, pix2 + 16 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + 32 * i_pix1, i_pix1, pix2 + 32 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + 32 * i_pix1, i_pix1, pix2 + 48 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 * i_pix1, i_pix1, pix2 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 48 * i_pix1, i_pix1, pix2 + 16 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + 48 * i_pix1, i_pix1, pix2 + 32 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + 48 * i_pix1, i_pix1, pix2 + 48 + 48 * i_pix2, i_pix2);
    return sum;
}
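
/* The 16x32 through 64x64 wrappers above all follow the same tiling rule:
 * accumulate sa8d_16x16_altivec over a W x H grid of 16x16 tiles.  A generic
 * form of that rule, shown only as a reference sketch (the helper name is
 * hypothetical and nothing below uses it), would be: */
template<int W, int H>
static inline int sa8d_tiled16_ref(const pixel* pix1, intptr_t i_pix1,
                                   const pixel* pix2, intptr_t i_pix2)
{
    int sum = 0;
    for (int y = 0; y < H; y += 16)
        for (int x = 0; x < W; x += 16)
            sum += sa8d_16x16_altivec(pix1 + y * i_pix1 + x, i_pix1,
                                      pix2 + y * i_pix2 + x, i_pix2);
    return sum;
}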

/* Initialize entries for pixel functions defined in this file */
void setupPixelPrimitives_altivec(EncoderPrimitives &p)
{
#define LUMA_PU(W, H) \
    if (W <= 16) { \
        p.pu[LUMA_ ## W ## x ## H].sad = sad16_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad16_x3_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad16_x4_altivec<W, H>; \
    } \
    else { \
        p.pu[LUMA_ ## W ## x ## H].sad = sad_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_altivec<W, H>; \
    }
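
    /* For reference, LUMA_PU(16, 8) selects the 16-pixel-wide specializations:
     *     p.pu[LUMA_16x8].sad    = sad16_altivec<16, 8>;
     *     p.pu[LUMA_16x8].sad_x3 = sad16_x3_altivec<16, 8>;
     *     p.pu[LUMA_16x8].sad_x4 = sad16_x4_altivec<16, 8>;
     * while widths above 16 take the sad_altivec, sad_x3_altivec and
     * sad_x4_altivec templates instead. */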

    LUMA_PU(4, 4);
    LUMA_PU(8, 8);
    LUMA_PU(16, 16);
    LUMA_PU(32, 32);
    LUMA_PU(64, 64);
    LUMA_PU(4, 8);
    LUMA_PU(8, 4);
    LUMA_PU(16,  8);
    LUMA_PU(8, 16);
    LUMA_PU(16, 12);
    LUMA_PU(12, 16);
    LUMA_PU(16,  4);
    LUMA_PU(4, 16);
    LUMA_PU(32, 16);
    LUMA_PU(16, 32);
    LUMA_PU(32, 24);
    LUMA_PU(24, 32);
    LUMA_PU(32,  8);
    LUMA_PU(8, 32);
    LUMA_PU(64, 32);
    LUMA_PU(32, 64);
    LUMA_PU(64, 48);
    LUMA_PU(48, 64);
    LUMA_PU(64, 16);
    LUMA_PU(16, 64);

    p.pu[LUMA_4x4].satd   = satd_4x4_altivec;//satd_4x4;
    p.pu[LUMA_8x8].satd   = satd_8x8_altivec;//satd8<8, 8>;
    p.pu[LUMA_8x4].satd   = satd_8x4_altivec;//satd_8x4;
    p.pu[LUMA_4x8].satd   = satd_4x8_altivec;//satd4<4, 8>;
    p.pu[LUMA_16x16].satd = satd_16x16_altivec;//satd8<16, 16>;
    p.pu[LUMA_16x8].satd  = satd_16x8_altivec;//satd8<16, 8>;
    p.pu[LUMA_8x16].satd  = satd_8x16_altivec;//satd8<8, 16>;
    p.pu[LUMA_16x12].satd = satd_altivec<16, 12>;//satd8<16, 12>;
    p.pu[LUMA_12x16].satd = satd_altivec<12, 16>;//satd4<12, 16>;
    p.pu[LUMA_16x4].satd  = satd_altivec<16, 4>;//satd8<16, 4>;
    p.pu[LUMA_4x16].satd  = satd_altivec<4, 16>;//satd4<4, 16>;
    p.pu[LUMA_32x32].satd = satd_altivec<32, 32>;//satd8<32, 32>;
    p.pu[LUMA_32x16].satd = satd_altivec<32, 16>;//satd8<32, 16>;
    p.pu[LUMA_16x32].satd = satd_altivec<16, 32>;//satd8<16, 32>;
    p.pu[LUMA_32x24].satd = satd_altivec<32, 24>;//satd8<32, 24>;
    p.pu[LUMA_24x32].satd = satd_altivec<24, 32>;//satd8<24, 32>;
    p.pu[LUMA_32x8].satd  = satd_altivec<32, 8>;//satd8<32, 8>;
    p.pu[LUMA_8x32].satd  = satd_altivec<8, 32>;//satd8<8, 32>;
    p.pu[LUMA_64x64].satd = satd_altivec<64, 64>;//satd8<64, 64>;
    p.pu[LUMA_64x32].satd = satd_altivec<64, 32>;//satd8<64, 32>;
    p.pu[LUMA_32x64].satd = satd_altivec<32, 64>;//satd8<32, 64>;
    p.pu[LUMA_64x48].satd = satd_altivec<64, 48>;//satd8<64, 48>;
    p.pu[LUMA_48x64].satd = satd_altivec<48, 64>;//satd8<48, 64>;
    p.pu[LUMA_64x16].satd = satd_altivec<64, 16>;//satd8<64, 16>;
    p.pu[LUMA_16x64].satd = satd_altivec<16, 64>;//satd8<16, 64>;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = satd_4x4_altivec;//satd_4x4;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd_8x8_altivec;//satd8<8, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd_16x16_altivec;//satd8<16, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd_altivec<32, 32>;//satd8<32, 32>;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = satd_8x4_altivec;//satd_8x4;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd_4x8_altivec;//satd4<4, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd_16x8_altivec;//satd8<16, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd_8x16_altivec;//satd8<8, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd_altivec<32, 16>;//satd8<32, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd_altivec<16, 32>;//satd8<16, 32>;

    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd_altivec<16, 12>;//satd4<16, 12>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd_altivec<12, 16>;//satd4<12, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd_altivec<16, 4>;//satd4<16, 4>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd_altivec<4, 16>;//satd4<4, 16>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd_altivec<32, 24>;//satd8<32, 24>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd_altivec<24, 32>;//satd8<24, 32>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd_altivec<32, 8>;//satd8<32, 8>;
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd_altivec<8, 32>;//satd8<8, 32>;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd_4x8_altivec;//satd4<4, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd_8x16_altivec;//satd8<8, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd_altivec<16, 32>;//satd8<16, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd_altivec<32, 64>;//satd8<32, 64>;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = satd_4x4_altivec;//satd_4x4;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd_8x8_altivec;//satd8<8, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd_altivec<4, 16>;//satd4<4, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd_16x16_altivec;//satd8<16, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd_altivec<8, 32>;//satd8<8, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd_altivec<32, 32>;//satd8<32, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd_altivec<16, 64>;//satd8<16, 64>;

    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd_altivec<8, 12>;//satd4<8, 12>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd_8x4_altivec;//satd4<8, 4>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd_altivec<16, 24>;//satd8<16, 24>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd_altivec<12, 32>;//satd4<12, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd_16x8_altivec;//satd8<16, 8>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd_altivec<4, 32>;//satd4<4, 32>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd_altivec<32, 48>;//satd8<32, 48>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd_altivec<24, 64>;//satd8<24, 64>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd_altivec<32, 16>;//satd8<32, 16>;
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd_altivec<8, 64>;//satd8<8, 64>;

    p.cu[BLOCK_4x4].sa8d   = satd_4x4_altivec;//satd_4x4;
    p.cu[BLOCK_8x8].sa8d   = sa8d_8x8_altivec;//sa8d_8x8;
    p.cu[BLOCK_16x16].sa8d = sa8d_16x16_altivec;//sa8d_16x16;
    p.cu[BLOCK_32x32].sa8d = sa8d_32x32_altivec;//sa8d16<32, 32>;
    p.cu[BLOCK_64x64].sa8d = sa8d_64x64_altivec;//sa8d16<64, 64>;

    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d_8x8_altivec;//sa8d8<8, 8>;
    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d_16x16_altivec;//sa8d16<16, 16>;
    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d_32x32_altivec;//sa8d16<32, 32>;

    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d_8x16_altivec;//sa8d8<8, 16>;
    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d_16x32_altivec;//sa8d16<16, 32>;
    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d_32x64_altivec;//sa8d16<32, 64>;

}
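
/* Usage sketch (illustrative only; the real call sites live in x265's common
 * primitive-setup code, outside this file): once an EncoderPrimitives table
 * has been populated by setupPixelPrimitives_altivec(), the encoder invokes
 * the entries through the table, e.g.
 *
 *     int cost = p.cu[BLOCK_8x8].sa8d(fenc, fencStride, pred, predStride);
 *
 * with pixel pointers and strides matching the four-argument
 * sa8d_8x8_altivec signature bound above. */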
}