/*****************************************************************************
 * Copyright (C) 2013-2020 MulticoreWare, Inc
 *
 * Authors: Steve Borho <steve@borho.org>
 *          Mandar Gurav <mandar@multicorewareinc.com>
 *          Mahesh Pittala <mahesh@multicorewareinc.com>
 *          Min Chen <min.chen@multicorewareinc.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at license @ x265.com.
 *****************************************************************************/

#include "common.h"
#include "primitives.h"
#include "x265.h"
#include "ppccommon.h"

#include <cstdlib> // abs()

//using namespace X265_NS;

namespace X265_NS {
// place functions in anonymous namespace (file static)

/* Null vector */
#define LOAD_ZERO const vec_u8_t zerov = vec_splat_u8( 0 )

#define zero_u8v  (vec_u8_t)  zerov
#define zero_s8v  (vec_s8_t)  zerov
#define zero_u16v (vec_u16_t) zerov
#define zero_s16v (vec_s16_t) zerov
#define zero_u32v (vec_u32_t) zerov
#define zero_s32v (vec_s32_t) zerov
/* 8 <-> 16 bits conversions */
#ifdef WORDS_BIGENDIAN
#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( zero_u8v, (vec_u8_t) v )
#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( zero_u8v, (vec_u8_t) v )
#else
#define vec_u8_to_u16_h(v) (vec_u16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
#define vec_u8_to_u16_l(v) (vec_u16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
#define vec_u8_to_s16_h(v) (vec_s16_t) vec_mergeh( (vec_u8_t) v, zero_u8v )
#define vec_u8_to_s16_l(v) (vec_s16_t) vec_mergel( (vec_u8_t) v, zero_u8v )
#endif

#define vec_u8_to_u16(v) vec_u8_to_u16_h(v)
#define vec_u8_to_s16(v) vec_u8_to_s16_h(v)
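
/* Illustrative note on the widening macros above (added for clarity):
 * interleaving a byte vector with the zero vector zero-extends each byte
 * to 16 bits. On big-endian targets the zero byte must come first in each
 * pair, on little-endian targets last, hence the two variants. A scalar
 * sketch of the same operation:
 *
 *     uint16_t widened[8];
 *     for (int i = 0; i < 8; i++)
 *         widened[i] = (uint16_t)bytes[i]; // zero-extend; no sign bits
 */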

#if defined(__GNUC__)
#define ALIGN_VAR_8(T, var)  T var __attribute__((aligned(8)))
#define ALIGN_VAR_16(T, var) T var __attribute__((aligned(16)))
#define ALIGN_VAR_32(T, var) T var __attribute__((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN_VAR_8(T, var)  __declspec(align(8)) T var
#define ALIGN_VAR_16(T, var) __declspec(align(16)) T var
#define ALIGN_VAR_32(T, var) __declspec(align(32)) T var
#endif // if defined(__GNUC__)

typedef uint8_t  pixel;
typedef uint32_t sum2_t;
typedef uint16_t sum_t;
#define BITS_PER_SUM (8 * sizeof(sum_t))

/***********************************************************************
 * SAD routines - altivec implementation
 **********************************************************************/
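/* For an lx-by-ly block, the routines in this section compute the usual
 * sum of absolute differences (restated here for clarity):
 *
 *     SAD = sum over y in [0, ly), x in [0, lx) of
 *           | pix1[y * stride_pix1 + x] - pix2[y * stride_pix2 + x] |
 *
 * The kernels below handle widths up to 16 bytes per row; wider blocks are
 * assembled further down from 16-wide (and 8-wide) column strips.
 */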
template<int lx, int ly>
void inline sum_columns_altivec(vec_s32_t sumv, int* sum){}

template<int lx, int ly>
int inline sad16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    assert(lx <= 16);
    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v;
    vec_u8_t  absv = zero_u8v;
    vec_s32_t sumv = zero_s32v;
    ALIGN_VAR_16(int, sum);

    for( int y = 0; y < ly; y++ )
    {
        pix1v = /*vec_vsx_ld*/vec_xl( 0, pix1);
        pix2v = /*vec_vsx_ld*/vec_xl( 0, pix2);
        //print_vec_u8("pix1v", &pix1v);
        //print_vec_u8("pix2v", &pix2v);

        absv = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
        //print_vec_u8("abs sub", &absv);

        sumv = (vec_s32_t) vec_sum4s( absv, (vec_u32_t) sumv);
        //print_vec_i("vec_sum4s 0", &sumv);

        pix1 += stride_pix1;
        pix2 += stride_pix2;
    }

    sum_columns_altivec<lx, ly>(sumv, &sum);
    //printf("<%d %d>%d\n", lx, ly, sum);
    return sum;
}
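
/* A scalar sketch of what sad16_altivec computes, for cross-checking only
 * (not compiled into the build):
 *
 *     int sad_ref = 0;
 *     for (int y = 0; y < ly; y++)
 *     {
 *         for (int x = 0; x < lx; x++)
 *             sad_ref += abs(pix1[x] - pix2[x]);
 *         pix1 += stride_pix1;
 *         pix2 += stride_pix2;
 *     }
 *
 * The vector loop forms |a - b| as vec_sub(vec_max(a, b), vec_min(a, b)),
 * which stays within unsigned bytes, then vec_sum4s folds each group of
 * four byte differences into one of four 32-bit partial sums that the
 * matching sum_columns_altivec specialization reduces at the end.
 */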

template<int lx, int ly> // to be implemented later
int sad16_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    return sum;
}

template<int lx, int ly> // to be implemented later
int sad_altivec(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2)
{
    int sum = 0;
    return sum;
}

template<>
void inline sum_columns_altivec<16, 4>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 8>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 12>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 16>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 24>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 32>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 48>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<16, 64>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<8, 4>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sum2s( sumv, zero_s32v );
    //print_vec_i("vec_sum2s", &sumv);
    sumv = vec_splat( sumv, 1 );
    //print_vec_i("vec_splat 1", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<8, 8>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sum2s( sumv, zero_s32v );
    //print_vec_i("vec_sum2s", &sumv);
    sumv = vec_splat( sumv, 1 );
    //print_vec_i("vec_splat 1", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<8, 16>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sum2s( sumv, zero_s32v );
    //print_vec_i("vec_sum2s", &sumv);
    sumv = vec_splat( sumv, 1 );
    //print_vec_i("vec_splat 1", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<8, 32>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_sum2s( sumv, zero_s32v );
    //print_vec_i("vec_sum2s", &sumv);
    sumv = vec_splat( sumv, 1 );
    //print_vec_i("vec_splat 1", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<4, 4>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_splat( sumv, 0 );
    //print_vec_i("vec_splat 0", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<4, 8>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_splat( sumv, 0 );
    //print_vec_i("vec_splat 0", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<4, 16>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    sumv = vec_splat( sumv, 0 );
    //print_vec_i("vec_splat 0", &sumv);
    vec_ste( sumv, 0, sum );
}

template<>
void inline sum_columns_altivec<12, 16>(vec_s32_t sumv, int* sum)
{
    LOAD_ZERO;
    vec_s32_t sum1v = vec_splat( sumv, 3 );
    sumv = vec_sums( sumv, zero_s32v );
    //print_vec_i("vec_sums", &sumv);
    sumv = vec_splat( sumv, 3 );
    //print_vec_i("vec_splat 3", &sumv);
    sumv = vec_sub(sumv, sum1v);
    vec_ste( sumv, 0, sum );
}
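
/* Note on the reductions above (added for clarity): sad16_altivec always
 * loads and accumulates full 16-byte rows, so each 32-bit lane holds the
 * partial sum of four columns. The <16, ly> variants total all four lanes
 * with vec_sums; the <8, ly> variants use vec_sum2s and splat element 1 so
 * only columns 0..7 are counted; the <4, ly> variants splat lane 0 for
 * columns 0..3. The <12, 16> variant totals everything, then subtracts the
 * saved lane 3 (columns 12..15). All of this presumes the few bytes past
 * the block are readable, as x265's padded buffers should allow.
 */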

template<int lx, int ly>
int inline sad_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2){ return 0; }

template<>
int inline sad_altivec<24, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<8, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<24 32>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<32, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 8>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 8>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<32 8>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<32, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 16>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 16>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<32 16>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<32, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 24>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 24>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<32 24>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<32, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<32 32>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<32, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2);
    //printf("<32 64>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<48, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2)
        + sad16_altivec<16, 64>(pix1+32, stride_pix1, pix2+32, stride_pix2);
    //printf("<48 64>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<64, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 16>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 16>(pix1+16, stride_pix1, pix2+16, stride_pix2)
        + sad16_altivec<16, 16>(pix1+32, stride_pix1, pix2+32, stride_pix2)
        + sad16_altivec<16, 16>(pix1+48, stride_pix1, pix2+48, stride_pix2);
    //printf("<64 16>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<64, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 32>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 32>(pix1+16, stride_pix1, pix2+16, stride_pix2)
        + sad16_altivec<16, 32>(pix1+32, stride_pix1, pix2+32, stride_pix2)
        + sad16_altivec<16, 32>(pix1+48, stride_pix1, pix2+48, stride_pix2);
    //printf("<64 32>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<64, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 48>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 48>(pix1+16, stride_pix1, pix2+16, stride_pix2)
        + sad16_altivec<16, 48>(pix1+32, stride_pix1, pix2+32, stride_pix2)
        + sad16_altivec<16, 48>(pix1+48, stride_pix1, pix2+48, stride_pix2);
    //printf("<64 48>%d\n", sum);
    return sum;
}

template<>
int inline sad_altivec<64, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sad16_altivec<16, 64>(pix1, stride_pix1, pix2, stride_pix2)
        + sad16_altivec<16, 64>(pix1+16, stride_pix1, pix2+16, stride_pix2)
        + sad16_altivec<16, 64>(pix1+32, stride_pix1, pix2+32, stride_pix2)
        + sad16_altivec<16, 64>(pix1+48, stride_pix1, pix2+48, stride_pix2);
    //printf("<64 64>%d\n", sum);
    return sum;
}

/***********************************************************************
 * SAD_X3 routines - altivec implementation
 **********************************************************************/
template<int lx, int ly>
void inline sad16_x3_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    assert(lx <= 16);
    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v, pix3v, pix4v;
    vec_u8_t  absv1_2 = zero_u8v;
    vec_u8_t  absv1_3 = zero_u8v;
    vec_u8_t  absv1_4 = zero_u8v;
    vec_s32_t sumv0 = zero_s32v;
    vec_s32_t sumv1 = zero_s32v;
    vec_s32_t sumv2 = zero_s32v;

    for( int y = 0; y < ly; y++ )
    {
        pix1v = vec_xl( 0, pix1); //@@RM vec_vsx_ld( 0, pix1);
        pix2v = vec_xl( 0, pix2); //@@RM vec_vsx_ld( 0, pix2);
        pix3v = vec_xl( 0, pix3); //@@RM vec_vsx_ld( 0, pix3);
        pix4v = vec_xl( 0, pix4); //@@RM vec_vsx_ld( 0, pix4);

        //@@RM : using vec_abs has 2 drawbacks here:
        //@@RM first, it produces an incorrect result (unpack should be used first)
        //@@RM second, it is slower than sub(max,min), as noted in Freescale's documentation
        //@@RM absv = (vector unsigned char)vec_abs((vector signed char)vec_sub(pix1v, pix2v));
        absv1_2 = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
        sumv0 = (vec_s32_t) vec_sum4s( absv1_2, (vec_u32_t) sumv0);

        absv1_3 = (vector unsigned char)vec_sub(vec_max(pix1v, pix3v), vec_min(pix1v, pix3v));
        sumv1 = (vec_s32_t) vec_sum4s( absv1_3, (vec_u32_t) sumv1);

        absv1_4 = (vector unsigned char)vec_sub(vec_max(pix1v, pix4v), vec_min(pix1v, pix4v));
        sumv2 = (vec_s32_t) vec_sum4s( absv1_4, (vec_u32_t) sumv2);

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }

    sum_columns_altivec<lx, ly>(sumv0, res+0);
    sum_columns_altivec<lx, ly>(sumv1, res+1);
    sum_columns_altivec<lx, ly>(sumv2, res+2);
    //printf("<%d %d>%d %d %d\n", lx, ly, res[0], res[1], res[2]);
}
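
/* sad16_x3_altivec restated (added note): pix1 is the fixed encoder block,
 * advanced by the compile-time FENC_STRIDE each row, while pix2..pix4 are
 * three candidate reference blocks sharing the stride frefstride. One pass
 * reuses each loaded fenc row for all three SADs; the result is equivalent
 * to (illustrative only):
 *
 *     res[0] = sad(pix1, FENC_STRIDE, pix2, frefstride);
 *     res[1] = sad(pix1, FENC_STRIDE, pix3, frefstride);
 *     res[2] = sad(pix1, FENC_STRIDE, pix4, frefstride);
 */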

template<int lx, int ly>
void inline sad_x3_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res){}

template<>
void inline sad_x3_altivec<24, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[3];
    sad16_x3_altivec<16, 32>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<8, 32>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    //printf("<24 32>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<32, 8>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[3];
    sad16_x3_altivec<16, 8>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 8>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    //printf("<32 8>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<32, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[3];
    sad16_x3_altivec<16, 16>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    //printf("<32 16>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<32, 24>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[3];
    sad16_x3_altivec<16, 24>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 24>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    //printf("<32 24>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void sad_x3_altivec<32, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    const int lx = 32;
    const int ly = 32;

    vector unsigned int v_zeros = {0, 0, 0, 0};

    vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0};

    vector signed int v_results_int_0;
    vector signed int v_results_int_1;
    vector signed int v_results_int_2;

    vector unsigned char v_pix1;
    vector unsigned char v_pix2;
    vector unsigned char v_pix3;
    vector unsigned char v_pix4;

    vector unsigned char v_abs_diff_0;
    vector unsigned char v_abs_diff_1;
    vector unsigned char v_abs_diff_2;

    vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF};

    vector signed short v_short_0_0, v_short_0_1;
    vector signed short v_short_1_0, v_short_1_1;
    vector signed short v_short_2_0, v_short_2_1;

    vector signed short v_sum_0;
    vector signed short v_sum_1;
    vector signed short v_sum_2;

    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x += 16)
        {
            v_pix1 = vec_xl(x, pix1);

            // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
            v_pix2 = vec_xl(x, pix2);
            v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2));
            v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0);
            v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask);
            v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0);
            v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask);
            v_sum_0 = vec_add(v_short_0_0, v_short_0_1);
            v_results_0 = vec_add(v_results_0, v_sum_0);

            // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
            v_pix3 = vec_xl(x, pix3);
            v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3));
            v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1);
            v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask);
            v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1);
            v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask);
            v_sum_1 = vec_add(v_short_1_0, v_short_1_1);
            v_results_1 = vec_add(v_results_1, v_sum_1);

            // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
            v_pix4 = vec_xl(x, pix4);
            v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4));
            v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2);
            v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask);
            v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2);
            v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask);
            v_sum_2 = vec_add(v_short_2_0, v_short_2_1);
            v_results_2 = vec_add(v_results_2, v_sum_2);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
    }

    v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros);
    v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros);
    res[0] = v_results_int_0[3];

    v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros);
    v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros);
    res[1] = v_results_int_1[3];

    v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros);
    v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros);
    res[2] = v_results_int_2[3];

    //printf("<32 32>%d %d %d\n", res[0], res[1], res[2]);
} // end sad_x3_altivec
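
/* Why the 16-bit accumulators in the <32, 32> kernel above cannot overflow
 * (a bounds note added for clarity): each absolute byte difference is at
 * most 255, so adding the unpacked high and low halves contributes at most
 * 510 per 16-bit lane per strip. With 32 rows and two 16-byte strips per
 * row, a lane accumulates at most 2 * 32 * 510 = 32640, just under the
 * signed-short limit of 32767, which is presumably why this direct 16-bit
 * scheme is not used for taller blocks.
 */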

template<>
void inline sad_x3_altivec<32, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[3];
    sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    //printf("<32 64>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<48, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[6];
    sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
    sad16_x3_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, res);
    res[0] = sum[0]+sum[3]+res[0];
    res[1] = sum[1]+sum[4]+res[1];
    res[2] = sum[2]+sum[5]+res[2];
    //printf("<48 64>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<64, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[9];
    sad16_x3_altivec<16, 16>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
    sad16_x3_altivec<16, 16>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
    sad16_x3_altivec<16, 16>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
    res[0] = sum[0]+sum[3]+sum[6]+res[0];
    res[1] = sum[1]+sum[4]+sum[7]+res[1];
    res[2] = sum[2]+sum[5]+sum[8]+res[2];
    //printf("<64 16>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<64, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[9];
    sad16_x3_altivec<16, 32>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 32>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
    sad16_x3_altivec<16, 32>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
    sad16_x3_altivec<16, 32>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
    res[0] = sum[0]+sum[3]+sum[6]+res[0];
    res[1] = sum[1]+sum[4]+sum[7]+res[1];
    res[2] = sum[2]+sum[5]+sum[8]+res[2];
    //printf("<64 32>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<64, 48>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[9];
    sad16_x3_altivec<16, 48>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 48>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
    sad16_x3_altivec<16, 48>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
    sad16_x3_altivec<16, 48>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
    res[0] = sum[0]+sum[3]+sum[6]+res[0];
    res[1] = sum[1]+sum[4]+sum[7]+res[1];
    res[2] = sum[2]+sum[5]+sum[8]+res[2];
    //printf("<64 48>%d %d %d\n", res[0], res[1], res[2]);
}

template<>
void inline sad_x3_altivec<64, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
{
    int32_t sum[9];
    sad16_x3_altivec<16, 64>(pix1, pix2, pix3, pix4, frefstride, sum);
    sad16_x3_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, frefstride, sum+3);
    sad16_x3_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, frefstride, sum+6);
    sad16_x3_altivec<16, 64>(pix1+48, pix2+48, pix3+48, pix4+48, frefstride, res);
    res[0] = sum[0]+sum[3]+sum[6]+res[0];
    res[1] = sum[1]+sum[4]+sum[7]+res[1];
    res[2] = sum[2]+sum[5]+sum[8]+res[2];
    //printf("<64 64>%d %d %d\n", res[0], res[1], res[2]);
}

/***********************************************************************
 * SAD_X4 routines - altivec implementation
 **********************************************************************/
template<int lx, int ly>
void inline sad16_x4_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    assert(lx <= 16);
    LOAD_ZERO;
    vec_u8_t  pix1v, pix2v, pix3v, pix4v, pix5v;
    vec_u8_t  absv1_2 = zero_u8v;
    vec_u8_t  absv1_3 = zero_u8v;
    vec_u8_t  absv1_4 = zero_u8v;
    vec_u8_t  absv1_5 = zero_u8v;
    vec_s32_t sumv0 = zero_s32v;
    vec_s32_t sumv1 = zero_s32v;
    vec_s32_t sumv2 = zero_s32v;
    vec_s32_t sumv3 = zero_s32v;

    for( int y = 0; y < ly; y++ )
    {
        pix1v = vec_xl( 0, pix1); //@@RM vec_vsx_ld( 0, pix1);
        pix2v = vec_xl( 0, pix2); //@@RM vec_vsx_ld( 0, pix2);
        pix3v = vec_xl( 0, pix3); //@@RM vec_vsx_ld( 0, pix3);
        pix4v = vec_xl( 0, pix4); //@@RM vec_vsx_ld( 0, pix4);
        pix5v = vec_xl( 0, pix5); //@@RM vec_vsx_ld( 0, pix5);

        //@@RM : using vec_abs has 2 drawbacks here:
        //@@RM first, it produces an incorrect result (unpack should be used first)
        //@@RM second, it is slower than sub(max,min), as noted in Freescale's documentation
        //@@RM absv = (vector unsigned char)vec_abs((vector signed char)vec_sub(pix1v, pix2v));
        absv1_2 = (vector unsigned char)vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
        sumv0 = (vec_s32_t) vec_sum4s( absv1_2, (vec_u32_t) sumv0);

        absv1_3 = (vector unsigned char)vec_sub(vec_max(pix1v, pix3v), vec_min(pix1v, pix3v));
        sumv1 = (vec_s32_t) vec_sum4s( absv1_3, (vec_u32_t) sumv1);

        absv1_4 = (vector unsigned char)vec_sub(vec_max(pix1v, pix4v), vec_min(pix1v, pix4v));
        sumv2 = (vec_s32_t) vec_sum4s( absv1_4, (vec_u32_t) sumv2);

        absv1_5 = (vector unsigned char)vec_sub(vec_max(pix1v, pix5v), vec_min(pix1v, pix5v));
        sumv3 = (vec_s32_t) vec_sum4s( absv1_5, (vec_u32_t) sumv3);

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }

    sum_columns_altivec<lx, ly>(sumv0, res+0);
    sum_columns_altivec<lx, ly>(sumv1, res+1);
    sum_columns_altivec<lx, ly>(sumv2, res+2);
    sum_columns_altivec<lx, ly>(sumv3, res+3);
    //printf("<%d %d>%d %d %d %d\n", lx, ly, res[0], res[1], res[2], res[3]);
}

template<int lx, int ly>
void inline sad_x4_altivec(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res){}

template<>
void inline sad_x4_altivec<24, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[4];
    sad16_x4_altivec<16, 32>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<8, 32>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    res[3] += sum[3];
    //printf("<24 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<32, 8>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[4];
    sad16_x4_altivec<16, 8>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 8>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    res[3] += sum[3];
    //printf("<32 8>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void sad_x4_altivec<32, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    const int lx = 32;
    const int ly = 16;

    vector unsigned int v_zeros = {0, 0, 0, 0};

    vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_3 = {0, 0, 0, 0, 0, 0, 0, 0};

    vector signed int v_results_int_0;
    vector signed int v_results_int_1;
    vector signed int v_results_int_2;
    vector signed int v_results_int_3;

    vector unsigned char v_pix1;
    vector unsigned char v_pix2;
    vector unsigned char v_pix3;
    vector unsigned char v_pix4;
    vector unsigned char v_pix5;

    vector unsigned char v_abs_diff_0;
    vector unsigned char v_abs_diff_1;
    vector unsigned char v_abs_diff_2;
    vector unsigned char v_abs_diff_3;

    vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF};

    vector signed short v_short_0_0, v_short_0_1;
    vector signed short v_short_1_0, v_short_1_1;
    vector signed short v_short_2_0, v_short_2_1;
    vector signed short v_short_3_0, v_short_3_1;

    vector signed short v_sum_0;
    vector signed short v_sum_1;
    vector signed short v_sum_2;
    vector signed short v_sum_3;

    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x += 16)
        {
            v_pix1 = vec_xl(x, pix1);

            // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
            v_pix2 = vec_xl(x, pix2);
            v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2));
            v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0);
            v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask);
            v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0);
            v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask);
            v_sum_0 = vec_add(v_short_0_0, v_short_0_1);
            v_results_0 = vec_add(v_results_0, v_sum_0);

            // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
            v_pix3 = vec_xl(x, pix3);
            v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3));
            v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1);
            v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask);
            v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1);
            v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask);
            v_sum_1 = vec_add(v_short_1_0, v_short_1_1);
            v_results_1 = vec_add(v_results_1, v_sum_1);

            // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
            v_pix4 = vec_xl(x, pix4);
            v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4));
            v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2);
            v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask);
            v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2);
            v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask);
            v_sum_2 = vec_add(v_short_2_0, v_short_2_1);
            v_results_2 = vec_add(v_results_2, v_sum_2);

            // for(int ii=0; ii<16; ii++) { res[3] += abs(pix1[x + ii] - pix5[x + ii]); }
            v_pix5 = vec_xl(x, pix5);
            v_abs_diff_3 = vec_sub(vec_max(v_pix1, v_pix5), vec_min(v_pix1, v_pix5));
            v_short_3_0 = vec_unpackh((vector signed char)v_abs_diff_3);
            v_short_3_0 = vec_and(v_short_3_0, v_unpack_mask);
            v_short_3_1 = vec_unpackl((vector signed char)v_abs_diff_3);
            v_short_3_1 = vec_and(v_short_3_1, v_unpack_mask);
            v_sum_3 = vec_add(v_short_3_0, v_short_3_1);
            v_results_3 = vec_add(v_results_3, v_sum_3);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }

    v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros);
    v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros);
    res[0] = v_results_int_0[3];

    v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros);
    v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros);
    res[1] = v_results_int_1[3];

    v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros);
    v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros);
    res[2] = v_results_int_2[3];

    v_results_int_3 = vec_sum4s((vector signed short)v_results_3, (vector signed int)v_zeros);
    v_results_int_3 = vec_sums(v_results_int_3, (vector signed int)v_zeros);
    res[3] = v_results_int_3[3];
    //printf("<32 16>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
} // end sad_x4_altivec

template<>
void inline sad_x4_altivec<32, 24>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[4];
    sad16_x4_altivec<16, 24>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 24>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    res[3] += sum[3];
    //printf("<32 24>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void sad_x4_altivec<32, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    const int lx = 32;
    const int ly = 32;

    vector unsigned int v_zeros = {0, 0, 0, 0};

    vector signed short v_results_0 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_1 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_2 = {0, 0, 0, 0, 0, 0, 0, 0};
    vector signed short v_results_3 = {0, 0, 0, 0, 0, 0, 0, 0};

    vector signed int v_results_int_0;
    vector signed int v_results_int_1;
    vector signed int v_results_int_2;
    vector signed int v_results_int_3;

    vector unsigned char v_pix1;
    vector unsigned char v_pix2;
    vector unsigned char v_pix3;
    vector unsigned char v_pix4;
    vector unsigned char v_pix5;

    vector unsigned char v_abs_diff_0;
    vector unsigned char v_abs_diff_1;
    vector unsigned char v_abs_diff_2;
    vector unsigned char v_abs_diff_3;

    vector signed short v_unpack_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF};

    vector signed short v_short_0_0, v_short_0_1;
    vector signed short v_short_1_0, v_short_1_1;
    vector signed short v_short_2_0, v_short_2_1;
    vector signed short v_short_3_0, v_short_3_1;

    vector signed short v_sum_0;
    vector signed short v_sum_1;
    vector signed short v_sum_2;
    vector signed short v_sum_3;

    res[0] = 0;
    res[1] = 0;
    res[2] = 0;
    res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x += 16)
        {
            v_pix1 = vec_xl(x, pix1);

            // for(int ii=0; ii<16; ii++) { res[0] += abs(pix1[x + ii] - pix2[x + ii]); }
            v_pix2 = vec_xl(x, pix2);
            v_abs_diff_0 = vec_sub(vec_max(v_pix1, v_pix2), vec_min(v_pix1, v_pix2));
            v_short_0_0 = vec_unpackh((vector signed char)v_abs_diff_0);
            v_short_0_0 = vec_and(v_short_0_0, v_unpack_mask);
            v_short_0_1 = vec_unpackl((vector signed char)v_abs_diff_0);
            v_short_0_1 = vec_and(v_short_0_1, v_unpack_mask);
            v_sum_0 = vec_add(v_short_0_0, v_short_0_1);
            v_results_0 = vec_add(v_results_0, v_sum_0);

            // for(int ii=0; ii<16; ii++) { res[1] += abs(pix1[x + ii] - pix3[x + ii]); }
            v_pix3 = vec_xl(x, pix3);
            v_abs_diff_1 = vec_sub(vec_max(v_pix1, v_pix3), vec_min(v_pix1, v_pix3));
            v_short_1_0 = vec_unpackh((vector signed char)v_abs_diff_1);
            v_short_1_0 = vec_and(v_short_1_0, v_unpack_mask);
            v_short_1_1 = vec_unpackl((vector signed char)v_abs_diff_1);
            v_short_1_1 = vec_and(v_short_1_1, v_unpack_mask);
            v_sum_1 = vec_add(v_short_1_0, v_short_1_1);
            v_results_1 = vec_add(v_results_1, v_sum_1);

            // for(int ii=0; ii<16; ii++) { res[2] += abs(pix1[x + ii] - pix4[x + ii]); }
            v_pix4 = vec_xl(x, pix4);
            v_abs_diff_2 = vec_sub(vec_max(v_pix1, v_pix4), vec_min(v_pix1, v_pix4));
            v_short_2_0 = vec_unpackh((vector signed char)v_abs_diff_2);
            v_short_2_0 = vec_and(v_short_2_0, v_unpack_mask);
            v_short_2_1 = vec_unpackl((vector signed char)v_abs_diff_2);
            v_short_2_1 = vec_and(v_short_2_1, v_unpack_mask);
            v_sum_2 = vec_add(v_short_2_0, v_short_2_1);
            v_results_2 = vec_add(v_results_2, v_sum_2);

            // for(int ii=0; ii<16; ii++) { res[3] += abs(pix1[x + ii] - pix5[x + ii]); }
            v_pix5 = vec_xl(x, pix5);
            v_abs_diff_3 = vec_sub(vec_max(v_pix1, v_pix5), vec_min(v_pix1, v_pix5));
            v_short_3_0 = vec_unpackh((vector signed char)v_abs_diff_3);
            v_short_3_0 = vec_and(v_short_3_0, v_unpack_mask);
            v_short_3_1 = vec_unpackl((vector signed char)v_abs_diff_3);
            v_short_3_1 = vec_and(v_short_3_1, v_unpack_mask);
            v_sum_3 = vec_add(v_short_3_0, v_short_3_1);
            v_results_3 = vec_add(v_results_3, v_sum_3);
        }

        pix1 += FENC_STRIDE;
        pix2 += frefstride;
        pix3 += frefstride;
        pix4 += frefstride;
        pix5 += frefstride;
    }

    v_results_int_0 = vec_sum4s((vector signed short)v_results_0, (vector signed int)v_zeros);
    v_results_int_0 = vec_sums(v_results_int_0, (vector signed int)v_zeros);
    res[0] = v_results_int_0[3];

    v_results_int_1 = vec_sum4s((vector signed short)v_results_1, (vector signed int)v_zeros);
    v_results_int_1 = vec_sums(v_results_int_1, (vector signed int)v_zeros);
    res[1] = v_results_int_1[3];

    v_results_int_2 = vec_sum4s((vector signed short)v_results_2, (vector signed int)v_zeros);
    v_results_int_2 = vec_sums(v_results_int_2, (vector signed int)v_zeros);
    res[2] = v_results_int_2[3];

    v_results_int_3 = vec_sum4s((vector signed short)v_results_3, (vector signed int)v_zeros);
    v_results_int_3 = vec_sums(v_results_int_3, (vector signed int)v_zeros);
    res[3] = v_results_int_3[3];

    //printf("<32 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
} // end sad_x4_altivec

template<>
void inline sad_x4_altivec<32, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[4];
    sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, res);
    res[0] += sum[0];
    res[1] += sum[1];
    res[2] += sum[2];
    res[3] += sum[3];
    //printf("<32 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<48, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[8];
    sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
    sad16_x4_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, res);
    res[0] = sum[0]+sum[4]+res[0];
    res[1] = sum[1]+sum[5]+res[1];
    res[2] = sum[2]+sum[6]+res[2];
    res[3] = sum[3]+sum[7]+res[3];
    //printf("<48 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<64, 16>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[12];
    sad16_x4_altivec<16, 16>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 16>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
    sad16_x4_altivec<16, 16>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
    sad16_x4_altivec<16, 16>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
    res[0] = sum[0]+sum[4]+sum[8]+res[0];
    res[1] = sum[1]+sum[5]+sum[9]+res[1];
    res[2] = sum[2]+sum[6]+sum[10]+res[2];
    res[3] = sum[3]+sum[7]+sum[11]+res[3];
    //printf("<64 16>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<64, 32>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[12];
    sad16_x4_altivec<16, 32>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 32>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
    sad16_x4_altivec<16, 32>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
    sad16_x4_altivec<16, 32>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
    res[0] = sum[0]+sum[4]+sum[8]+res[0];
    res[1] = sum[1]+sum[5]+sum[9]+res[1];
    res[2] = sum[2]+sum[6]+sum[10]+res[2];
    res[3] = sum[3]+sum[7]+sum[11]+res[3];
    //printf("<64 32>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<64, 48>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[12];
    sad16_x4_altivec<16, 48>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 48>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
    sad16_x4_altivec<16, 48>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
    sad16_x4_altivec<16, 48>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
    res[0] = sum[0]+sum[4]+sum[8]+res[0];
    res[1] = sum[1]+sum[5]+sum[9]+res[1];
    res[2] = sum[2]+sum[6]+sum[10]+res[2];
    res[3] = sum[3]+sum[7]+sum[11]+res[3];
    //printf("<64 48>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

template<>
void inline sad_x4_altivec<64, 64>(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
{
    int32_t sum[12];
    sad16_x4_altivec<16, 64>(pix1, pix2, pix3, pix4, pix5, frefstride, sum);
    sad16_x4_altivec<16, 64>(pix1+16, pix2+16, pix3+16, pix4+16, pix5+16, frefstride, sum+4);
    sad16_x4_altivec<16, 64>(pix1+32, pix2+32, pix3+32, pix4+32, pix5+32, frefstride, sum+8);
    sad16_x4_altivec<16, 64>(pix1+48, pix2+48, pix3+48, pix4+48, pix5+48, frefstride, res);
    res[0] = sum[0]+sum[4]+sum[8]+res[0];
    res[1] = sum[1]+sum[5]+sum[9]+res[1];
    res[2] = sum[2]+sum[6]+sum[10]+res[2];
    res[3] = sum[3]+sum[7]+sum[11]+res[3];
    //printf("<64 64>%d %d %d %d\n", res[0], res[1], res[2], res[3]);
}

/***********************************************************************
 * SATD routines - altivec implementation
 **********************************************************************/
#define HADAMARD4_VEC(s0, s1, s2, s3, d0, d1, d2, d3) \
{\
    vec_s16_t t0, t1, t2, t3;\
    t0 = vec_add(s0, s1);\
    t1 = vec_sub(s0, s1);\
    t2 = vec_add(s2, s3);\
    t3 = vec_sub(s2, s3);\
    d0 = vec_add(t0, t2);\
    d2 = vec_sub(t0, t2);\
    d1 = vec_add(t1, t3);\
    d3 = vec_sub(t1, t3);\
}
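
/* What HADAMARD4_VEC computes per lane (explanatory note): the two add/sub
 * stages apply the order-4 Hadamard butterfly, i.e. (d0, d1, d2, d3) is
 * (s0, s1, s2, s3) multiplied by
 *
 *     [ 1  1  1  1 ]
 *     [ 1 -1  1 -1 ]
 *     [ 1  1 -1 -1 ]
 *     [ 1 -1 -1  1 ]
 *
 * Applying it across rows, transposing, and applying it again gives the
 * 2-D transform whose absolute coefficients SATD sums.
 */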

#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

#define VEC_TRANSPOSE_8(a0,a1,a2,a3,a4,a5,a6,a7,b0,b1,b2,b3,b4,b5,b6,b7) \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 ); \
    a0 = vec_mergeh( b0, b4 ); \
    a1 = vec_mergel( b0, b4 ); \
    a2 = vec_mergeh( b1, b5 ); \
    a3 = vec_mergel( b1, b5 ); \
    a4 = vec_mergeh( b2, b6 ); \
    a5 = vec_mergel( b2, b6 ); \
    a6 = vec_mergeh( b3, b7 ); \
    a7 = vec_mergel( b3, b7 ); \
    b0 = vec_mergeh( a0, a4 ); \
    b1 = vec_mergel( a0, a4 ); \
    b2 = vec_mergeh( a1, a5 ); \
    b3 = vec_mergel( a1, a5 ); \
    b4 = vec_mergeh( a2, a6 ); \
    b5 = vec_mergel( a2, a6 ); \
    b6 = vec_mergeh( a3, a7 ); \
    b7 = vec_mergel( a3, a7 )

int satd_4x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16(int, sum);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv, satdv1, satdv2, satdv3;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    /* Hadamard H */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv += satdv2;

    satdv = vec_sum2s( satdv, zero_s32v );
    //satdv = vec_splat( satdv, 1 );
    //vec_ste( satdv, 0, &sum );
    sum = vec_extract(satdv, 1);
    //print(sum);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    //satdv = vec_splat( satdv, 1 );
    //vec_ste( satdv, 0, &sum );
    sum = vec_extract(satdv, 1);
    //print(sum);
#endif
    return sum >> 1;
}
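
/* Note on the final "sum >> 1" (added for clarity): the unnormalized 4x4
 * Hadamard transform doubles the sum of absolute transformed differences
 * relative to the conventional SATD value, so halving here matches the
 * scalar satd_4x4 reference primitive. The #if 1 path keeps four
 * independent vec_sum4s accumulators, presumably to shorten the dependency
 * chain; the #else path is the equivalent single-accumulator form.
 */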

#define HADAMARD4_x2vec(v_out0, v_out1, v_in0, v_in1, v_perm_l0_0, v_perm_l0_1) \
{ \
    vector unsigned int v_l0_input_0, v_l0_input_1 ; \
    v_l0_input_0 = vec_perm((vector unsigned int)v_in0, (vector unsigned int)v_in1, v_perm_l0_0) ; \
    v_l0_input_1 = vec_perm((vector unsigned int)v_in0, (vector unsigned int)v_in1, v_perm_l0_1) ; \
\
    vector unsigned int v_l0_add_result, v_l0_sub_result ; \
    v_l0_add_result = vec_add(v_l0_input_0, v_l0_input_1) ; \
    v_l0_sub_result = vec_sub(v_l0_input_0, v_l0_input_1) ; \
\
    vector unsigned char v_perm_l1_0 = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17} ; \
    vector unsigned char v_perm_l1_1 = {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F} ; \
\
    vector unsigned int v_l1_input_0, v_l1_input_1 ; \
    v_l1_input_0 = vec_perm(v_l0_add_result, v_l0_sub_result, v_perm_l1_0) ; \
    v_l1_input_1 = vec_perm(v_l0_add_result, v_l0_sub_result, v_perm_l1_1) ; \
\
    vector unsigned int v_l1_add_result, v_l1_sub_result ; \
    v_l1_add_result = vec_add(v_l1_input_0, v_l1_input_1) ; \
    v_l1_sub_result = vec_sub(v_l1_input_0, v_l1_input_1) ; \
\
    v_out0 = v_l1_add_result ; \
    v_out1 = v_l1_sub_result ; \
}
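
/* The comments in the routines below refer to the scalar HADAMARD4 macro
 * from x265's pixel code; the permute-based macro above reproduces the same
 * two-level add/sub butterfly across full vectors. A sketch of that scalar
 * butterfly, for reference only (assumes the sum2_t typedef above): */
#if 0
#define HADAMARD4_SCALAR_SKETCH(d0, d1, d2, d3, s0, s1, s2, s3) { \
    sum2_t t0 = s0 + s1; \
    sum2_t t1 = s0 - s1; \
    sum2_t t2 = s2 + s3; \
    sum2_t t3 = s2 - s3; \
    d0 = t0 + t2; \
    d2 = t0 - t2; \
    d1 = t1 + t3; \
    d3 = t1 - t3; \
}
#endif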

int satd_4x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv, satdv1, satdv2, satdv3;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    /* Hadamard H */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv += satdv2;
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );
#endif

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    /* Hadamard H */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);
    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v, diff0v, diff1v, diff2v, diff3v );
    /* Hadamard V */
    HADAMARD4_VEC(diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v);

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv += vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv += satdv2;

    satdv = vec_sum2s( satdv, zero_s32v );
    sum = vec_extract(satdv, 1);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, satdv );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}

#if 1
static int satd_8x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    const vector signed short v_unsigned_short_mask = {0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF, 0x00FF} ;
    vector unsigned char v_pix1_ub, v_pix2_ub ;
    vector signed short v_pix1_ss, v_pix2_ss ;
    vector signed short v_sub ;
    vector signed int v_sub_sw_0, v_sub_sw_1 ;
    vector signed int v_packed_sub_0, v_packed_sub_1 ;
    vector unsigned int v_hadamard_result_0, v_hadamard_result_1, v_hadamard_result_2, v_hadamard_result_3 ;

    // for (int i = 0; i < 4; i += 2, pix1 += 2*stride_pix1, pix2 += 2*stride_pix2)
    // {
    //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
    //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
    //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
    //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);

    // Load 16 elements from each pix array
    v_pix1_ub = vec_xl(0, pix1) ;
    v_pix2_ub = vec_xl(0, pix2) ;

    // Only the first 8 pixels matter, widened to shorts
    v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
    v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;

    // Undo the sign extension of the unpacks
    v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
    v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;

    // Perform the subtraction
    v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;

    // Unpack the sub results into ints
    v_sub_sw_0 = vec_unpackh(v_sub) ;
    v_sub_sw_1 = vec_unpackl(v_sub) ;
    v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16, 16, 16, 16}) ;

    // Add the int sub results (compatibility with the original code)
    v_packed_sub_0 = vec_add(v_sub_sw_0, v_sub_sw_1) ;

    //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
    //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
    //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
    //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);

    // Load 16 elements from each pix array
    v_pix1_ub = vec_xl(stride_pix1, pix1) ;
    v_pix2_ub = vec_xl(stride_pix2, pix2) ;

    // Only the first 8 pixels matter, widened to shorts
    v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
    v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;

    // Undo the sign extension of the unpacks
    v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
    v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;

    // Perform the subtraction
    v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;

    // Unpack the sub results into ints
    v_sub_sw_0 = vec_unpackh(v_sub) ;
    v_sub_sw_1 = vec_unpackl(v_sub) ;
    v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16, 16, 16, 16}) ;

    // Add the int sub results (compatibility with the original code)
    v_packed_sub_1 = vec_add(v_sub_sw_0, v_sub_sw_1) ;

    // original: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    // modified while vectorizing: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], v_packed_sub_0[0], v_packed_sub_0[1], v_packed_sub_0[2], v_packed_sub_0[3]);

    // original: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], a0, a1, a2, a3);
    // modified while vectorizing: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], v_packed_sub_1[0], v_packed_sub_1[1], v_packed_sub_1[2], v_packed_sub_1[3]);

    // Do two hadamard4(int) at once, fully utilizing the vector width
    // Note that the hadamard4(int) provided by x264/x265 is actually two hadamard4(short) performed simultaneously
    const vector unsigned char v_perm_l0_0 = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B} ;
    const vector unsigned char v_perm_l0_1 = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F} ;
    HADAMARD4_x2vec(v_hadamard_result_0, v_hadamard_result_1, v_packed_sub_0, v_packed_sub_1, v_perm_l0_0, v_perm_l0_1) ;

    //##
    // tmp[0][0] = v_hadamard_result_0[0] ;
    // tmp[0][1] = v_hadamard_result_0[2] ;
    // tmp[0][2] = v_hadamard_result_1[0] ;
    // tmp[0][3] = v_hadamard_result_1[2] ;

    // tmp[1][0] = v_hadamard_result_0[1] ;
    // tmp[1][1] = v_hadamard_result_0[3] ;
    // tmp[1][2] = v_hadamard_result_1[1] ;
    // tmp[1][3] = v_hadamard_result_1[3] ;
    //##

    //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
    //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
    //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
    //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);

    // Load 16 elements from each pix array
    v_pix1_ub = vec_xl(2*stride_pix1, pix1) ;
    v_pix2_ub = vec_xl(2*stride_pix2, pix2) ;

    // Only the first 8 pixels matter, widened to shorts
    v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
    v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;

    // Undo the sign extension of the unpacks
    v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
    v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;

    // Perform the subtraction
    v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;

    // Unpack the sub results into ints
    v_sub_sw_0 = vec_unpackh(v_sub) ;
    v_sub_sw_1 = vec_unpackl(v_sub) ;
    v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16, 16, 16, 16}) ;

    // Add the int sub results (compatibility with the original code)
    v_packed_sub_0 = vec_add(v_sub_sw_0, v_sub_sw_1) ;

    //a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);
    //a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);
    //a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);
    //a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);

    // Load 16 elements from each pix array
    v_pix1_ub = vec_xl(3*stride_pix1, pix1) ;
    v_pix2_ub = vec_xl(3*stride_pix2, pix2) ;

    // Only the first 8 pixels matter, widened to shorts
    v_pix1_ss = vec_unpackh((vector signed char)v_pix1_ub) ;
    v_pix2_ss = vec_unpackh((vector signed char)v_pix2_ub) ;

    // Undo the sign extension of the unpacks
    v_pix1_ss = vec_and(v_pix1_ss, v_unsigned_short_mask) ;
    v_pix2_ss = vec_and(v_pix2_ss, v_unsigned_short_mask) ;

    // Perform the subtraction
    v_sub = vec_sub(v_pix1_ss, v_pix2_ss) ;

    // Unpack the sub results into ints
    v_sub_sw_0 = vec_unpackh(v_sub) ;
    v_sub_sw_1 = vec_unpackl(v_sub) ;
    v_sub_sw_1 = vec_sl(v_sub_sw_1, (vector unsigned int){16, 16, 16, 16}) ;

    // Add the int sub results (compatibility with the original code)
    v_packed_sub_1 = vec_add(v_sub_sw_0, v_sub_sw_1) ;

    // original: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
    // modified while vectorizing: HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], v_packed_sub_0[0], v_packed_sub_0[1], v_packed_sub_0[2], v_packed_sub_0[3]);

    // original: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], a0, a1, a2, a3);
    // modified while vectorizing: HADAMARD4(tmp[i+1][0], tmp[i+1][1], tmp[i+1][2], tmp[i+1][3], v_packed_sub_1[0], v_packed_sub_1[1], v_packed_sub_1[2], v_packed_sub_1[3]);

    // Do two hadamard4(int) at once, fully utilizing the vector width
    // Note that the hadamard4(int) provided by x264/x265 is actually two hadamard4(short) performed simultaneously
    HADAMARD4_x2vec(v_hadamard_result_2, v_hadamard_result_3, v_packed_sub_0, v_packed_sub_1, v_perm_l0_0, v_perm_l0_1) ;

    //##
    //## tmp[2][0] = v_hadamard_result_2[0] ;
    //## tmp[2][1] = v_hadamard_result_2[2] ;
    //## tmp[2][2] = v_hadamard_result_3[0] ;
    //## tmp[2][3] = v_hadamard_result_3[2] ;
    //##
    //## tmp[3][0] = v_hadamard_result_2[1] ;
    //## tmp[3][1] = v_hadamard_result_2[3] ;
    //## tmp[3][2] = v_hadamard_result_3[1] ;
    //## tmp[3][3] = v_hadamard_result_3[3] ;
    //##
    // }
    // for (int i = 0; i < 4; i++)
    // {
    //     HADAMARD4(a0, a1, a2, a3, tmp[0][0], tmp[1][0], tmp[2][0], tmp[3][0]);
    //     sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);

    //     HADAMARD4(a0, a1, a2, a3, tmp[0][1], tmp[1][1], tmp[2][1], tmp[3][1]);
    //     sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);
    const vector unsigned char v_lowerloop_perm_l0_0 = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B, 0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B} ;
    const vector unsigned char v_lowerloop_perm_l0_1 = {0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F, 0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F} ;
    HADAMARD4_x2vec(v_hadamard_result_0, v_hadamard_result_2, v_hadamard_result_0, v_hadamard_result_2, v_lowerloop_perm_l0_0, v_lowerloop_perm_l0_1) ;

    const vector unsigned int v_15 = {15, 15, 15, 15} ;
    const vector unsigned int v_0x10001 = (vector unsigned int){0x10001, 0x10001, 0x10001, 0x10001} ;
    const vector unsigned int v_0xffff = (vector unsigned int){0xffff, 0xffff, 0xffff, 0xffff} ;

    // abs2() on the two shorts packed in each element: build the per-half
    // sign mask, then (x + mask) ^ mask
    vector unsigned int v_hadamard_result_s_0 ;
    v_hadamard_result_s_0 = vec_sra(v_hadamard_result_0, v_15) ;
    v_hadamard_result_s_0 = vec_and(v_hadamard_result_s_0, v_0x10001) ;
    asm ("vmuluwm %0,%1,%2"
         : "=v" (v_hadamard_result_s_0)
         : "v" (v_hadamard_result_s_0), "v" (v_0xffff)
        ) ;
    v_hadamard_result_0 = vec_add(v_hadamard_result_0, v_hadamard_result_s_0) ;
    v_hadamard_result_0 = vec_xor(v_hadamard_result_0, v_hadamard_result_s_0) ;

    vector unsigned int v_hadamard_result_s_2 ;
    v_hadamard_result_s_2 = vec_sra(v_hadamard_result_2, v_15) ;
    v_hadamard_result_s_2 = vec_and(v_hadamard_result_s_2, v_0x10001) ;
    asm ("vmuluwm %0,%1,%2"
         : "=v" (v_hadamard_result_s_2)
         : "v" (v_hadamard_result_s_2), "v" (v_0xffff)
        ) ;
    v_hadamard_result_2 = vec_add(v_hadamard_result_2, v_hadamard_result_s_2) ;
    v_hadamard_result_2 = vec_xor(v_hadamard_result_2, v_hadamard_result_s_2) ;

    //     HADAMARD4(a0, a1, a2, a3, tmp[0][2], tmp[1][2], tmp[2][2], tmp[3][2]);
    //     sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);

    //     HADAMARD4(a0, a1, a2, a3, tmp[0][3], tmp[1][3], tmp[2][3], tmp[3][3]);
    //     sum += abs2(a0) + abs2(a1) + abs2(a2) + abs2(a3);

    HADAMARD4_x2vec(v_hadamard_result_1, v_hadamard_result_3, v_hadamard_result_1, v_hadamard_result_3, v_lowerloop_perm_l0_0, v_lowerloop_perm_l0_1) ;

    vector unsigned int v_hadamard_result_s_1 ;
    v_hadamard_result_s_1 = vec_sra(v_hadamard_result_1, v_15) ;
    v_hadamard_result_s_1 = vec_and(v_hadamard_result_s_1, v_0x10001) ;
    asm ("vmuluwm %0,%1,%2"
         : "=v" (v_hadamard_result_s_1)
         : "v" (v_hadamard_result_s_1), "v" (v_0xffff)
        ) ;
    v_hadamard_result_1 = vec_add(v_hadamard_result_1, v_hadamard_result_s_1) ;
    v_hadamard_result_1 = vec_xor(v_hadamard_result_1, v_hadamard_result_s_1) ;

    vector unsigned int v_hadamard_result_s_3 ;
    v_hadamard_result_s_3 = vec_sra(v_hadamard_result_3, v_15) ;
    v_hadamard_result_s_3 = vec_and(v_hadamard_result_s_3, v_0x10001) ;
    asm ("vmuluwm %0,%1,%2"
         : "=v" (v_hadamard_result_s_3)
         : "v" (v_hadamard_result_s_3), "v" (v_0xffff)
        ) ;
    v_hadamard_result_3 = vec_add(v_hadamard_result_3, v_hadamard_result_s_3) ;
    v_hadamard_result_3 = vec_xor(v_hadamard_result_3, v_hadamard_result_s_3) ;

    // }

    vector unsigned int v_sum_0, v_sum_1 ;
    vector signed int v_sum ;

    v_sum_0 = vec_add(v_hadamard_result_0, v_hadamard_result_2) ;
    v_sum_1 = vec_add(v_hadamard_result_1, v_hadamard_result_3) ;

    v_sum_0 = vec_add(v_sum_0, v_sum_1) ;

    vector signed int v_zero = {0, 0, 0, 0} ;
    v_sum = vec_sums((vector signed int)v_sum_0, v_zero) ;

    // return (((sum_t)sum) + (sum >> BITS_PER_SUM)) >> 1;
    return (((sum_t)v_sum[3]) + (v_sum[3] >> BITS_PER_SUM)) >> 1;
}
#else
int satd_8x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );
    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );

    return sum >> 1;
}
#endif
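
/* The vmuluwm sequences in the active satd_8x4 variant above are the vector
 * form of the packed absolute-value trick used by x265's scalar SATD code:
 * two 16-bit magnitudes travel in one 32-bit sum2_t, and each half is
 * negated through its own sign mask. A scalar sketch of that trick, assuming
 * the sum_t / sum2_t / BITS_PER_SUM definitions above (abs2_sketch is a
 * hypothetical name, not used elsewhere in this file): */
#if 0
static inline sum2_t abs2_sketch(sum2_t a)
{
    // 0xFFFF in each 16-bit half whose value is negative, 0 otherwise
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
    // per half: (x + s) ^ s == |x|
    return (a + s) ^ s;
}
#endif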

int satd_8x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );
    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
    vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
    //vec_s32_t satdv = (vec_s32_t){0, 0, 0, 0};

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;

    satdv = vec_sums( satdv, zero_s32v );
    sum = vec_extract(satdv, 3);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}

int satd_8x16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v;
    //vec_s32_t satdv = (vec_s32_t){0, 0, 0, 0};
    vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );
#endif

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += stride_pix1;
    pix2 += stride_pix2;

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diff0v, diff1v, diff2v, diff3v,
                     diff4v, diff5v, diff6v, diff7v );

    HADAMARD4_VEC( diff0v, diff1v, diff2v, diff3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diff4v, diff5v, diff6v, diff7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv += vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;

    satdv = vec_sums( satdv, zero_s32v );
    sum = vec_extract(satdv, 3);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, satdv );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}

#define VEC_DIFF_S16(p1, i1, p2, i2, dh, dl)\
{\
    pix1v = (vec_s16_t)vec_xl(0, p1);\
    temp0v = vec_u8_to_s16_h( pix1v );\
    temp1v = vec_u8_to_s16_l( pix1v );\
    pix2v = (vec_s16_t)vec_xl(0, p2);\
    temp2v = vec_u8_to_s16_h( pix2v );\
    temp3v = vec_u8_to_s16_l( pix2v );\
    dh = vec_sub( temp0v, temp2v );\
    dl = vec_sub( temp1v, temp3v );\
    p1 += i1;\
    p2 += i2;\
}
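
/* A scalar sketch of what one VEC_DIFF_S16 invocation produces, for reading
 * the 16-wide routines below: a 16-pixel row splits into a high-half and a
 * low-half vector of eight 16-bit differences, and both pointers advance by
 * their strides. Illustrative only; vec_diff_s16_sketch is a hypothetical
 * helper and int16_t assumes <cstdint>. */
#if 0
static inline void vec_diff_s16_sketch(const pixel*& p1, intptr_t i1,
                                       const pixel*& p2, intptr_t i2,
                                       int16_t dh[8], int16_t dl[8])
{
    for (int j = 0; j < 8; j++)
    {
        dh[j] = (int16_t)(p1[j] - p2[j]);         // high half: pixels 0..7
        dl[j] = (int16_t)(p1[j + 8] - p2[j + 8]); // low half:  pixels 8..15
    }
    p1 += i1;
    p2 += i2;
}
#endif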


int satd_16x4_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );
    LOAD_ZERO;
    //vec_s32_t satdv = (vec_s32_t){0, 0, 0, 0};
    vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh0v, diffl0v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh1v, diffl1v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh2v, diffl2v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh3v, diffl3v);

    HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffl0v, diffl1v, diffl2v, diffl3v );

    HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;

    satdv = vec_sums( satdv, zero_s32v );
    sum = vec_extract(satdv, 3);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}

int satd_16x8_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    ALIGN_VAR_16( int, sum );
    LOAD_ZERO;
    //vec_s32_t satdv = (vec_s32_t){0, 0, 0, 0};
    vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v,
              temp4v, temp5v, temp6v, temp7v;

    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh0v, diffl0v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh1v, diffl1v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh2v, diffl2v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh3v, diffl3v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh4v, diffl4v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh5v, diffl5v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh6v, diffl6v);
    VEC_DIFF_S16(pix1, stride_pix1, pix2, stride_pix2, diffh7v, diffl7v);

    HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffh0v, diffh1v, diffh2v, diffh3v,
                     diffh4v, diffh5v, diffh6v, diffh7v );

    HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );
#endif

    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv += vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;

    satdv = vec_sums( satdv, zero_s32v );
    sum = vec_extract(satdv, 3);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, satdv );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}
2501
satd_16x16_altivec(const pixel * pix1,intptr_t stride_pix1,const pixel * pix2,intptr_t stride_pix2)2502 int satd_16x16_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
2503 {
2504 ALIGN_VAR_16( int, sum );
2505 LOAD_ZERO;
2506 //vec_s32_t satdv=(vec_s32_t){0,0,0,0};
2507 vec_s32_t satdv, satdv1, satdv2, satdv3, satdv4, satdv5, satdv6, satdv7;
2508 vec_s16_t pix1v, pix2v;
2509 vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
2510 diffh4v, diffh5v, diffh6v, diffh7v;
2511 vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
2512 diffl4v, diffl5v, diffl6v, diffl7v;
2513 vec_s16_t temp0v, temp1v, temp2v, temp3v,
2514 temp4v, temp5v, temp6v, temp7v;
2515
2516 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2517 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2518 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2519 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2520 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh4v, diffl4v);
2521 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh5v, diffl5v);
2522 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh6v, diffl6v);
2523 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh7v, diffl7v);
2524
2525 HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2526 HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2527
2528 VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2529 temp4v, temp5v, temp6v, temp7v,
2530 diffh0v, diffh1v, diffh2v, diffh3v,
2531 diffh4v, diffh5v, diffh6v, diffh7v );
2532
2533 HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2534 HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2535
2536 #if 1
2537 temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2538 satdv = vec_sum4s( temp0v, zero_s32v);
2539
2540 temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2541 satdv1= vec_sum4s( temp1v, zero_s32v );
2542
2543 temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2544 satdv2= vec_sum4s( temp2v, zero_s32v );
2545
2546 temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2547 satdv3= vec_sum4s( temp3v, zero_s32v );
2548
2549 temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2550 satdv4 = vec_sum4s( temp4v, zero_s32v);
2551
2552 temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2553 satdv5= vec_sum4s( temp5v, zero_s32v );
2554
2555 temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2556 satdv6= vec_sum4s( temp6v, zero_s32v );
2557
2558 temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2559 satdv7= vec_sum4s( temp7v, zero_s32v );
2560
2561 satdv += satdv1;
2562 satdv2 += satdv3;
2563 satdv4 += satdv5;
2564 satdv6 += satdv7;
2565
2566 satdv += satdv2;
2567 satdv4 += satdv6;
2568 satdv += satdv4;
2569 #else
2570 temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2571 satdv = vec_sum4s( temp0v, zero_s32v);
2572
2573 temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2574 satdv= vec_sum4s( temp1v, satdv );
2575
2576 temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2577 satdv= vec_sum4s( temp2v, satdv );
2578
2579 temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2580 satdv= vec_sum4s( temp3v, satdv );
2581
2582 temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2583 satdv = vec_sum4s( temp4v, satdv);
2584
2585 temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2586 satdv= vec_sum4s( temp5v, satdv );
2587
2588 temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2589 satdv= vec_sum4s( temp6v, satdv );
2590
2591 temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2592 satdv= vec_sum4s( temp7v, satdv );
2593 #endif
2594
2595 HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2596 HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );
2597
2598 VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2599 temp4v, temp5v, temp6v, temp7v,
2600 diffl0v, diffl1v, diffl2v, diffl3v,
2601 diffl4v, diffl5v, diffl6v, diffl7v );
2602
2603 HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
2604 HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );
2605
2606 #if 1
2607 temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2608 satdv += vec_sum4s( temp0v, zero_s32v);
2609
2610 temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2611 satdv1= vec_sum4s( temp1v, zero_s32v );
2612
2613 temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2614 satdv2= vec_sum4s( temp2v, zero_s32v );
2615
2616 temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2617 satdv3= vec_sum4s( temp3v, zero_s32v );
2618
2619 temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2620 satdv4 = vec_sum4s( temp4v, zero_s32v);
2621
2622 temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2623 satdv5= vec_sum4s( temp5v, zero_s32v );
2624
2625 temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2626 satdv6= vec_sum4s( temp6v, zero_s32v );
2627
2628 temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2629 satdv7= vec_sum4s( temp7v, zero_s32v );
2630
2631 satdv += satdv1;
2632 satdv2 += satdv3;
2633 satdv4 += satdv5;
2634 satdv6 += satdv7;
2635
2636 satdv += satdv2;
2637 satdv4 += satdv6;
2638 satdv += satdv4;
2639
2640 #else
2641 temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2642 satdv = vec_sum4s( temp0v, satdv);
2643
2644 temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2645 satdv= vec_sum4s( temp1v, satdv );
2646
2647 temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2648 satdv= vec_sum4s( temp2v, satdv );
2649
2650 temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
2651 satdv= vec_sum4s( temp3v, satdv );
2652
2653 temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
2654 satdv = vec_sum4s( temp4v, satdv);
2655
2656 temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
2657 satdv= vec_sum4s( temp5v, satdv );
2658
2659 temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
2660 satdv= vec_sum4s( temp6v, satdv );
2661
2662 temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
2663 satdv= vec_sum4s( temp7v, satdv );
2664 #endif
2665 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh0v,diffl0v);
2666 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh1v, diffl1v);
2667 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh2v, diffl2v);
2668 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh3v, diffl3v);
2669 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh4v, diffl4v);
2670 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh5v, diffl5v);
2671 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh6v, diffl6v);
2672 VEC_DIFF_S16(pix1,stride_pix1,pix2,stride_pix2,diffh7v, diffl7v);
2673
2674 HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2675 HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2676
2677 VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
2678 temp4v, temp5v, temp6v, temp7v,
2679 diffh0v, diffh1v, diffh2v, diffh3v,
2680 diffh4v, diffh5v, diffh6v, diffh7v );
2681
2682 HADAMARD4_VEC( diffh0v, diffh1v, diffh2v, diffh3v, temp0v, temp1v, temp2v, temp3v );
2683 HADAMARD4_VEC( diffh4v, diffh5v, diffh6v, diffh7v, temp4v, temp5v, temp6v, temp7v );
2684
2685 #if 1
2686 temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
2687 satdv += vec_sum4s( temp0v, zero_s32v);
2688
2689 temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
2690 satdv1= vec_sum4s( temp1v, zero_s32v );
2691
2692 temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
2693 satdv2= vec_sum4s( temp2v, zero_s32v );
2694
2695 temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, satdv );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );
#endif
    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );

    VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
                     temp4v, temp5v, temp6v, temp7v,
                     diffl0v, diffl1v, diffl2v, diffl3v,
                     diffl4v, diffl5v, diffl6v, diffl7v );

    HADAMARD4_VEC( diffl0v, diffl1v, diffl2v, diffl3v, temp0v, temp1v, temp2v, temp3v );
    HADAMARD4_VEC( diffl4v, diffl5v, diffl6v, diffl7v, temp4v, temp5v, temp6v, temp7v );

#if 1
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv += vec_sum4s( temp0v, zero_s32v );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv1 = vec_sum4s( temp1v, zero_s32v );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv2 = vec_sum4s( temp2v, zero_s32v );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv3 = vec_sum4s( temp3v, zero_s32v );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv4 = vec_sum4s( temp4v, zero_s32v );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv5 = vec_sum4s( temp5v, zero_s32v );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv6 = vec_sum4s( temp6v, zero_s32v );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv7 = vec_sum4s( temp7v, zero_s32v );

    satdv += satdv1;
    satdv2 += satdv3;
    satdv4 += satdv5;
    satdv6 += satdv7;

    satdv += satdv2;
    satdv4 += satdv6;
    satdv += satdv4;

    satdv = vec_sums( satdv, zero_s32v );
    sum = vec_extract(satdv, 3);
#else
    temp0v = vec_max( temp0v, vec_sub( zero_s16v, temp0v ) );
    satdv = vec_sum4s( temp0v, satdv );

    temp1v = vec_max( temp1v, vec_sub( zero_s16v, temp1v ) );
    satdv = vec_sum4s( temp1v, satdv );

    temp2v = vec_max( temp2v, vec_sub( zero_s16v, temp2v ) );
    satdv = vec_sum4s( temp2v, satdv );

    temp3v = vec_max( temp3v, vec_sub( zero_s16v, temp3v ) );
    satdv = vec_sum4s( temp3v, satdv );

    temp4v = vec_max( temp4v, vec_sub( zero_s16v, temp4v ) );
    satdv = vec_sum4s( temp4v, satdv );

    temp5v = vec_max( temp5v, vec_sub( zero_s16v, temp5v ) );
    satdv = vec_sum4s( temp5v, satdv );

    temp6v = vec_max( temp6v, vec_sub( zero_s16v, temp6v ) );
    satdv = vec_sum4s( temp6v, satdv );

    temp7v = vec_max( temp7v, vec_sub( zero_s16v, temp7v ) );
    satdv = vec_sum4s( temp7v, satdv );

    satdv = vec_sums( satdv, zero_s32v );
    satdv = vec_splat( satdv, 3 );
    vec_ste( satdv, 0, &sum );
#endif
    return sum >> 1;
}
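
/* Reduction pattern used above: vec_max(x, 0 - x) gives |x| per 16-bit lane,
 * vec_sum4s folds groups of four lanes into four 32-bit partial sums, and
 * vec_sums collapses those into element 3, which is then extracted as the
 * scalar total. The "#if 1" variant keeps eight independent accumulators and
 * reduces them in a tree, exposing more instruction-level parallelism than
 * the serial "#else" chain. A compiled-out scalar sketch of the same tail,
 * for illustration only (the name satd_reduce_ref and the coeff array are
 * hypothetical, not part of this file):
 */
#if 0
static int satd_reduce_ref(const int16_t coeff[8][8])
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            sum += abs(coeff[i][j]); // |coefficient| accumulation
    return sum >> 1;                 // undo the Hadamard scale doubling
}
#endif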


template<int w, int h>
int satd_altivec(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
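
/* Each block size used by the encoder gets an explicit specialization below,
 * built by tiling the fixed-size kernels (satd_4x4/4x8/8x4/8x8/8x16/16x4/
 * 16x8/16x16): SATD is additive over disjoint subblocks, so a w x h cost is
 * the sum of its tiles' costs. A compiled-out scalar sketch of the idea
 * (satd_8x8_ref is a hypothetical stand-in for any 8x8 kernel):
 */
#if 0
static int satd_ref_tiled(const pixel* pix1, intptr_t stride_pix1,
                          const pixel* pix2, intptr_t stride_pix2,
                          int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y += 8)      // rows advance by y * stride
        for (int x = 0; x < w; x += 8)  // columns advance by x pixels
            sum += satd_8x8_ref(pix1 + y * stride_pix1 + x, stride_pix1,
                                pix2 + y * stride_pix2 + x, stride_pix2);
    return sum;
}
#endif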

template<>
int satd_altivec<4, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_4x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<4, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<4, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_4x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+4*stride_pix1, stride_pix1, pix2+4*stride_pix2, stride_pix2);

    return satd;
}

template<>
int satd_altivec<4, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);

    return satd;
}

template<>
int satd_altivec<4, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2)
           + satd_4x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);

    return satd;
}

template<>
int satd_altivec<4, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_4x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2)
           + satd_4x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
           + satd_4x8_altivec(pix1+24*stride_pix1, stride_pix1, pix2+24*stride_pix2, stride_pix2);

    return satd;
}

template<>
int satd_altivec<4, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<4, 32>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<4, 32>(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2);

    return satd;
}

template<>
int satd_altivec<8, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_8x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<8, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<8, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x4_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<8, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<8, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+8*stride_pix1, stride_pix1, pix2+8*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<8, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<8, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
           + satd_8x16_altivec(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2)
           + satd_8x16_altivec(pix1+48*stride_pix1, stride_pix1, pix2+48*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x4_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
           + satd_8x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_4x4_altivec(pix3+8, stride_pix1, pix4+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
           + satd_8x8_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_4x8_altivec(pix3+8, stride_pix1, pix4+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_8x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_4x8_altivec(pix1+8, stride_pix1, pix2+8, stride_pix2)
           + satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<4, 16>(pix1+8, stride_pix1, pix2+8, stride_pix2)
           + satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2);
    return satd;
}

template<>
int satd_altivec<12, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    const pixel *pix7 = pix1 + 48*stride_pix1;
    const pixel *pix8 = pix2 + 48*stride_pix2;
    satd = satd_8x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<4, 16>(pix1+8, stride_pix1, pix2+8, stride_pix2)
           + satd_8x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_altivec<4, 16>(pix3+8, stride_pix1, pix4+8, stride_pix2)
           + satd_8x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_altivec<4, 16>(pix5+8, stride_pix1, pix6+8, stride_pix2)
           + satd_8x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
           + satd_altivec<4, 16>(pix7+8, stride_pix1, pix8+8, stride_pix2);
    return satd;
}
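
/* Widths that are not a multiple of 8 are split column-wise: a 12 x h block
 * is an 8 x h block plus a 4 x h block at column offset 8, e.g.
 *     satd(12x16) = satd(8x16 at +0) + satd(4x16 at +8).
 * Row offsets advance by n*stride_pix, column offsets by n pixels. */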

template<>
int satd_altivec<16, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<16, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<16, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1+4*stride_pix1, stride_pix1, pix2+4*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<16, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    return satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2);
}

template<>
int satd_altivec<16, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<16, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<16, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2)
           + satd_16x16_altivec(pix1+32*stride_pix1, stride_pix1, pix2+32*stride_pix2, stride_pix2)
           + satd_16x16_altivec(pix1+48*stride_pix1, stride_pix1, pix2+48*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x4_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_8x4_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<24, 16>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<24, 8>(pix1+16*stride_pix1, stride_pix1, pix2+16*stride_pix2, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_8x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<24, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    const pixel *pix7 = pix1 + 48*stride_pix1;
    const pixel *pix8 = pix2 + 48*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_8x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_8x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_8x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
           + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
           + satd_8x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x4_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
           + satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x4_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
           + satd_16x8_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x8_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1 + 16, stride_pix1, pix2 + 16, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3 + 16, stride_pix1, pix4 + 16, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_16x16_altivec(pix5 + 16, stride_pix1, pix6 + 16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<32, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    const pixel *pix7 = pix1 + 48*stride_pix1;
    const pixel *pix8 = pix2 + 48*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
           + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
           + satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x4_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x4_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x4_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x4_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x4_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x4_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 8*stride_pix1;
    const pixel *pix4 = pix2 + 8*stride_pix2;
    satd = satd_16x8_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x8_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x8_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<48, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    const pixel *pix7 = pix1 + 48*stride_pix1;
    const pixel *pix8 = pix2 + 48*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
           + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
           + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
           + satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2)
           + satd_16x16_altivec(pix7+32, stride_pix1, pix8+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 4>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<32, 4>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<32, 4>(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 8>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<32, 8>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<32, 8>(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 12>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<32, 12>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<32, 12>(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 16>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 24>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    satd = satd_altivec<32, 24>(pix1, stride_pix1, pix2, stride_pix2)
           + satd_altivec<32, 24>(pix1+32, stride_pix1, pix2+32, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 32>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
           + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 48>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
           + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
           + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
           + satd_16x16_altivec(pix5+48, stride_pix1, pix6+48, stride_pix2);
    return satd;
}

template<>
int satd_altivec<64, 64>(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
{
    int satd = 0;
    const pixel *pix3 = pix1 + 16*stride_pix1;
    const pixel *pix4 = pix2 + 16*stride_pix2;
    const pixel *pix5 = pix1 + 32*stride_pix1;
    const pixel *pix6 = pix2 + 32*stride_pix2;
    const pixel *pix7 = pix1 + 48*stride_pix1;
    const pixel *pix8 = pix2 + 48*stride_pix2;
    satd = satd_16x16_altivec(pix1, stride_pix1, pix2, stride_pix2)
           + satd_16x16_altivec(pix1+16, stride_pix1, pix2+16, stride_pix2)
           + satd_16x16_altivec(pix1+32, stride_pix1, pix2+32, stride_pix2)
           + satd_16x16_altivec(pix1+48, stride_pix1, pix2+48, stride_pix2)
           + satd_16x16_altivec(pix3, stride_pix1, pix4, stride_pix2)
           + satd_16x16_altivec(pix3+16, stride_pix1, pix4+16, stride_pix2)
           + satd_16x16_altivec(pix3+32, stride_pix1, pix4+32, stride_pix2)
           + satd_16x16_altivec(pix3+48, stride_pix1, pix4+48, stride_pix2)
           + satd_16x16_altivec(pix5, stride_pix1, pix6, stride_pix2)
           + satd_16x16_altivec(pix5+16, stride_pix1, pix6+16, stride_pix2)
           + satd_16x16_altivec(pix5+32, stride_pix1, pix6+32, stride_pix2)
           + satd_16x16_altivec(pix5+48, stride_pix1, pix6+48, stride_pix2)
           + satd_16x16_altivec(pix7, stride_pix1, pix8, stride_pix2)
           + satd_16x16_altivec(pix7+16, stride_pix1, pix8+16, stride_pix2)
           + satd_16x16_altivec(pix7+32, stride_pix1, pix8+32, stride_pix2)
           + satd_16x16_altivec(pix7+48, stride_pix1, pix8+48, stride_pix2);
    return satd;
}


/***********************************************************************
 * SA8D routines - altivec implementation
 **********************************************************************/
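
/* The satd_* kernels above are built from 4-point Hadamard passes
 * (HADAMARD4_VEC) and halve the absolute coefficient sum; the sa8d_*
 * kernels below run the full 8-point butterfly in both dimensions over
 * each 8x8 block and normalize with (sum + 2) >> 2, i.e. round(sum / 4). */
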
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
{                                                         \
    /* int a0 = SRC(0) + SRC(4) */                        \
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v);              \
    /* int a4 = SRC(0) - SRC(4) */                        \
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v);              \
    /* int a1 = SRC(1) + SRC(5) */                        \
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v);              \
    /* int a5 = SRC(1) - SRC(5) */                        \
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v);              \
    /* int a2 = SRC(2) + SRC(6) */                        \
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v);              \
    /* int a6 = SRC(2) - SRC(6) */                        \
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v);              \
    /* int a3 = SRC(3) + SRC(7) */                        \
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v);              \
    /* int a7 = SRC(3) - SRC(7) */                        \
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v);              \
                                                          \
    /* int b0 = a0 + a2 */                                \
    vec_s16_t b0v = vec_add(a0v, a2v);                    \
    /* int b2 = a0 - a2 */                                \
    vec_s16_t b2v = vec_sub(a0v, a2v);                    \
    /* int b1 = a1 + a3 */                                \
    vec_s16_t b1v = vec_add(a1v, a3v);                    \
    /* int b3 = a1 - a3 */                                \
    vec_s16_t b3v = vec_sub(a1v, a3v);                    \
    /* int b4 = a4 + a6 */                                \
    vec_s16_t b4v = vec_add(a4v, a6v);                    \
    /* int b6 = a4 - a6 */                                \
    vec_s16_t b6v = vec_sub(a4v, a6v);                    \
    /* int b5 = a5 + a7 */                                \
    vec_s16_t b5v = vec_add(a5v, a7v);                    \
    /* int b7 = a5 - a7 */                                \
    vec_s16_t b7v = vec_sub(a5v, a7v);                    \
                                                          \
    /* DST(0, b0 + b1) */                                 \
    sa8d0v = vec_add(b0v, b1v);                           \
    /* DST(1, b0 - b1) */                                 \
    sa8d1v = vec_sub(b0v, b1v);                           \
    /* DST(2, b2 + b3) */                                 \
    sa8d2v = vec_add(b2v, b3v);                           \
    /* DST(3, b2 - b3) */                                 \
    sa8d3v = vec_sub(b2v, b3v);                           \
    /* DST(4, b4 + b5) */                                 \
    sa8d4v = vec_add(b4v, b5v);                           \
    /* DST(5, b4 - b5) */                                 \
    sa8d5v = vec_sub(b4v, b5v);                           \
    /* DST(6, b6 + b7) */                                 \
    sa8d6v = vec_add(b6v, b7v);                           \
    /* DST(7, b6 - b7) */                                 \
    sa8d7v = vec_sub(b6v, b7v);                           \
}
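
/* One SA8D_1D_ALTIVEC invocation is the 1-D 8-point Hadamard butterfly
 * (three add/sub stages) applied to eight row vectors at once. The 2-D
 * transform below is the usual separable form M = H8 * D * H8^T, realized
 * as a 1-D pass over rows, VEC_TRANSPOSE_8, then a second 1-D pass; the
 * block cost is then round(sum(|M[i][j]|) / 4). */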

inline int sa8d_8x8_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);
    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);
    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
    vec_s16_t abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
    vec_s16_t abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
    vec_s16_t abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
    vec_s16_t abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov);
    sumblocv = vec_sum4s(sum4567v, sumblocv);
    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov);
    sumblocv = vec_splat(sumblocv, 3);
    vec_ste(sumblocv, 0, &sum);

    return (sum + 2) >> 2;
}
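
/* The (sum + 2) >> 2 above is rounding division by 4: both 1-D Hadamard
 * passes are unnormalized, so the absolute sum is scaled back down to keep
 * sa8d costs comparable in magnitude to satd. */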


/* Placeholder overload for 16-bit residual input: no AltiVec implementation
 * here yet, so it always returns 0. */
int sa8d_8x8_altivec(const int16_t* pix1, intptr_t i_pix1)
{
    int sum = 0;
    return ((sum + 2) >> 2);
}

inline int sa8d_8x16_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    ALIGN_VAR_16(int, sum1);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;
    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);
    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);
    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);

    /* accumulate the absolute values of all elements of the resulting block */
    vec_s16_t abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
    vec_s16_t abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
    vec_s16_t abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
    vec_s16_t abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
    vec_s16_t abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv, sumblocv1;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov);
    sumblocv = vec_sum4s(sum4567v, sumblocv);
    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov);
    sumblocv = vec_splat(sumblocv, 3);
    vec_ste(sumblocv, 0, &sum);

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff0v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff1v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff2v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff3v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff4v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff5v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff6v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    pix1v = vec_u8_to_s16(vec_xl(0, pix1));
    pix2v = vec_u8_to_s16(vec_xl(0, pix2));
    diff7v = vec_sub( pix1v, pix2v );
    pix1 += i_pix1;
    pix2 += i_pix2;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);
    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);
    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v);

    /* accumulate the absolute values of all elements of the resulting block */
    abs0v = vec_max( sa8d0v, vec_sub( zero_s16v, sa8d0v ) );
    abs1v = vec_max( sa8d1v, vec_sub( zero_s16v, sa8d1v ) );
    sum01v = vec_add(abs0v, abs1v);

    abs2v = vec_max( sa8d2v, vec_sub( zero_s16v, sa8d2v ) );
    abs3v = vec_max( sa8d3v, vec_sub( zero_s16v, sa8d3v ) );
    sum23v = vec_add(abs2v, abs3v);

    abs4v = vec_max( sa8d4v, vec_sub( zero_s16v, sa8d4v ) );
    abs5v = vec_max( sa8d5v, vec_sub( zero_s16v, sa8d5v ) );
    sum45v = vec_add(abs4v, abs5v);

    abs6v = vec_max( sa8d6v, vec_sub( zero_s16v, sa8d6v ) );
    abs7v = vec_max( sa8d7v, vec_sub( zero_s16v, sa8d7v ) );
    sum67v = vec_add(abs6v, abs7v);

    sum0123v = vec_add(sum01v, sum23v);
    sum4567v = vec_add(sum45v, sum67v);

    sumblocv1 = vec_sum4s(sum0123v, (vec_s32_t)zerov);
    sumblocv1 = vec_sum4s(sum4567v, sumblocv1);
    sumblocv1 = vec_sums(sumblocv1, (vec_s32_t)zerov);
    sumblocv1 = vec_splat(sumblocv1, 3);
    vec_ste(sumblocv1, 0, &sum1);

    sum = (sum + 2) >> 2;
    sum1 = (sum1 + 2) >> 2;
    sum += sum1;
    return (sum);
}
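
/* Note the two 8x8 halves are rounded and normalized separately before the
 * final add: sa8d of a taller block is the sum of independently normalized
 * 8x8 costs, not one normalization of the grand total. */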

inline int sa8d_16x8_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sumh);
    ALIGN_VAR_16(int, suml);

    LOAD_ZERO;
    vec_s16_t pix1v, pix2v;
    vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
              diffh4v, diffh5v, diffh6v, diffh7v;
    vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
              diffl4v, diffl5v, diffl6v, diffl7v;
    vec_s16_t sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v;
    vec_s16_t sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;

    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v);

    SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
    VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
                    sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);
    SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);

    SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
    VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
                    sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);
    SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);

    /* accumulate the absolute values of all elements of the resulting block */
    sa8dh0v = vec_max( sa8dh0v, vec_sub( zero_s16v, sa8dh0v ) );
    sa8dh1v = vec_max( sa8dh1v, vec_sub( zero_s16v, sa8dh1v ) );
    vec_s16_t sumh01v = vec_add(sa8dh0v, sa8dh1v);

    sa8dh2v = vec_max( sa8dh2v, vec_sub( zero_s16v, sa8dh2v ) );
    sa8dh3v = vec_max( sa8dh3v, vec_sub( zero_s16v, sa8dh3v ) );
    vec_s16_t sumh23v = vec_add(sa8dh2v, sa8dh3v);

    sa8dh4v = vec_max( sa8dh4v, vec_sub( zero_s16v, sa8dh4v ) );
    sa8dh5v = vec_max( sa8dh5v, vec_sub( zero_s16v, sa8dh5v ) );
    vec_s16_t sumh45v = vec_add(sa8dh4v, sa8dh5v);

    sa8dh6v = vec_max( sa8dh6v, vec_sub( zero_s16v, sa8dh6v ) );
    sa8dh7v = vec_max( sa8dh7v, vec_sub( zero_s16v, sa8dh7v ) );
    vec_s16_t sumh67v = vec_add(sa8dh6v, sa8dh7v);

    vec_s16_t sumh0123v = vec_add(sumh01v, sumh23v);
    vec_s16_t sumh4567v = vec_add(sumh45v, sumh67v);

    vec_s32_t sumblocv_h;

    sumblocv_h = vec_sum4s(sumh0123v, (vec_s32_t)zerov);
    sumblocv_h = vec_sum4s(sumh4567v, sumblocv_h);
    sumblocv_h = vec_sums(sumblocv_h, (vec_s32_t)zerov);
    sumblocv_h = vec_splat(sumblocv_h, 3);
    vec_ste(sumblocv_h, 0, &sumh);

    sa8dl0v = vec_max( sa8dl0v, vec_sub( zero_s16v, sa8dl0v ) );
    sa8dl1v = vec_max( sa8dl1v, vec_sub( zero_s16v, sa8dl1v ) );
    vec_s16_t suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max( sa8dl2v, vec_sub( zero_s16v, sa8dl2v ) );
    sa8dl3v = vec_max( sa8dl3v, vec_sub( zero_s16v, sa8dl3v ) );
    vec_s16_t suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max( sa8dl4v, vec_sub( zero_s16v, sa8dl4v ) );
    sa8dl5v = vec_max( sa8dl5v, vec_sub( zero_s16v, sa8dl5v ) );
    vec_s16_t suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max( sa8dl6v, vec_sub( zero_s16v, sa8dl6v ) );
    sa8dl7v = vec_max( sa8dl7v, vec_sub( zero_s16v, sa8dl7v ) );
    vec_s16_t suml67v = vec_add(sa8dl6v, sa8dl7v);

    vec_s16_t suml0123v = vec_add(suml01v, suml23v);
    vec_s16_t suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l;

    sumblocv_l = vec_sum4s(suml0123v, (vec_s32_t)zerov);
    sumblocv_l = vec_sum4s(suml4567v, sumblocv_l);
    sumblocv_l = vec_sums(sumblocv_l, (vec_s32_t)zerov);
    sumblocv_l = vec_splat(sumblocv_l, 3);
    vec_ste(sumblocv_l, 0, &suml);

    sumh = (sumh + 2) >> 2;
    suml = (suml + 2) >> 2;
    return (sumh + suml);
}
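
/* VEC_DIFF_S16 (defined with the other helper macros) loads a 16-pixel row
 * from each source, widens to 16 bits, and returns the difference as two
 * vectors: diffh*v holding one 8-column half and diffl*v the other. A 16x8
 * block is therefore two side-by-side 8x8 sa8d transforms, each normalized
 * and then summed. */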
3958
sa8d_16x16_altivec(const pixel * pix1,intptr_t i_pix1,const pixel * pix2,intptr_t i_pix2)3959 inline int sa8d_16x16_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
3960 {
3961 ALIGN_VAR_16(int, sumh0);
3962 ALIGN_VAR_16(int, suml0);
3963
3964 ALIGN_VAR_16(int, sumh1);
3965 ALIGN_VAR_16(int, suml1);
3966
3967 ALIGN_VAR_16(int, sum);
3968
3969 LOAD_ZERO;
3970 vec_s16_t pix1v, pix2v;
3971 vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
3972 diffh4v, diffh5v, diffh6v, diffh7v;
3973 vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
3974 diffl4v, diffl5v, diffl6v, diffl7v;
3975 vec_s16_t sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v;
3976 vec_s16_t sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v;
3977 vec_s16_t temp0v, temp1v, temp2v, temp3v;
3978
3979 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh0v,diffl0v);
3980 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh1v, diffl1v);
3981 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh2v, diffl2v);
3982 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh3v, diffl3v);
3983 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh4v, diffl4v);
3984 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh5v, diffl5v);
3985 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh6v, diffl6v);
3986 VEC_DIFF_S16(pix1,i_pix1,pix2,i_pix2,diffh7v, diffl7v);
3987
3988 SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
3989 VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
3990 sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v );
3991 SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);
3992
3993 SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
3994 VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
3995 sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v );
3996 SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);
3997
3998 /* accumulation of the absolute value of all elements of the resulting bloc */
3999 sa8dh0v = vec_max( sa8dh0v, vec_sub( zero_s16v, sa8dh0v ) );
4000 sa8dh1v = vec_max( sa8dh1v, vec_sub( zero_s16v, sa8dh1v ) );
4001 vec_s16_t sumh01v = vec_add(sa8dh0v, sa8dh1v);
4002
4003 sa8dh2v = vec_max( sa8dh2v, vec_sub( zero_s16v, sa8dh2v ) );
4004 sa8dh3v = vec_max( sa8dh3v, vec_sub( zero_s16v, sa8dh3v ) );
4005 vec_s16_t sumh23v = vec_add(sa8dh2v, sa8dh3v);
4006
4007 sa8dh4v = vec_max( sa8dh4v, vec_sub( zero_s16v, sa8dh4v ) );
4008 sa8dh5v = vec_max( sa8dh5v, vec_sub( zero_s16v, sa8dh5v ) );
4009 vec_s16_t sumh45v = vec_add(sa8dh4v, sa8dh5v);
4010
4011 sa8dh6v = vec_max( sa8dh6v, vec_sub( zero_s16v, sa8dh6v ) );
4012 sa8dh7v = vec_max( sa8dh7v, vec_sub( zero_s16v, sa8dh7v ) );
4013 vec_s16_t sumh67v = vec_add(sa8dh6v, sa8dh7v);
4014
4015 vec_s16_t sumh0123v = vec_add(sumh01v, sumh23v);
4016 vec_s16_t sumh4567v = vec_add(sumh45v, sumh67v);
4017
    vec_s32_t sumblocv_h0;

    sumblocv_h0 = vec_sum4s(sumh0123v, (vec_s32_t)zerov);
    sumblocv_h0 = vec_sum4s(sumh4567v, sumblocv_h0);
    sumblocv_h0 = vec_sums(sumblocv_h0, (vec_s32_t)zerov);
    sumblocv_h0 = vec_splat(sumblocv_h0, 3);
    vec_ste(sumblocv_h0, 0, &sumh0);

    sa8dl0v = vec_max(sa8dl0v, vec_sub(zero_s16v, sa8dl0v));
    sa8dl1v = vec_max(sa8dl1v, vec_sub(zero_s16v, sa8dl1v));
    vec_s16_t suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max(sa8dl2v, vec_sub(zero_s16v, sa8dl2v));
    sa8dl3v = vec_max(sa8dl3v, vec_sub(zero_s16v, sa8dl3v));
    vec_s16_t suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max(sa8dl4v, vec_sub(zero_s16v, sa8dl4v));
    sa8dl5v = vec_max(sa8dl5v, vec_sub(zero_s16v, sa8dl5v));
    vec_s16_t suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max(sa8dl6v, vec_sub(zero_s16v, sa8dl6v));
    sa8dl7v = vec_max(sa8dl7v, vec_sub(zero_s16v, sa8dl7v));
    vec_s16_t suml67v = vec_add(sa8dl6v, sa8dl7v);

    vec_s16_t suml0123v = vec_add(suml01v, suml23v);
    vec_s16_t suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l0;

    sumblocv_l0 = vec_sum4s(suml0123v, (vec_s32_t)zerov);
    sumblocv_l0 = vec_sum4s(suml4567v, sumblocv_l0);
    sumblocv_l0 = vec_sums(sumblocv_l0, (vec_s32_t)zerov);
    sumblocv_l0 = vec_splat(sumblocv_l0, 3);
    vec_ste(sumblocv_l0, 0, &suml0);

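    /* Second 8-row slice of the 16x16 block: the same
     * diff -> transform -> transpose -> transform -> absolute-sum pipeline
     * is repeated for rows 8..15 (VEC_DIFF_S16 advances pix1/pix2). */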
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v);
    VEC_DIFF_S16(pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v);

    SA8D_1D_ALTIVEC(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v);
    VEC_TRANSPOSE_8(diffh0v, diffh1v, diffh2v, diffh3v, diffh4v, diffh5v, diffh6v, diffh7v,
                    sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);
    SA8D_1D_ALTIVEC(sa8dh0v, sa8dh1v, sa8dh2v, sa8dh3v, sa8dh4v, sa8dh5v, sa8dh6v, sa8dh7v);

    SA8D_1D_ALTIVEC(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v);
    VEC_TRANSPOSE_8(diffl0v, diffl1v, diffl2v, diffl3v, diffl4v, diffl5v, diffl6v, diffl7v,
                    sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);
    SA8D_1D_ALTIVEC(sa8dl0v, sa8dl1v, sa8dl2v, sa8dl3v, sa8dl4v, sa8dl5v, sa8dl6v, sa8dl7v);

    /* accumulate the absolute values of all elements of the resulting block */
    sa8dh0v = vec_max(sa8dh0v, vec_sub(zero_s16v, sa8dh0v));
    sa8dh1v = vec_max(sa8dh1v, vec_sub(zero_s16v, sa8dh1v));
    sumh01v = vec_add(sa8dh0v, sa8dh1v);

    sa8dh2v = vec_max(sa8dh2v, vec_sub(zero_s16v, sa8dh2v));
    sa8dh3v = vec_max(sa8dh3v, vec_sub(zero_s16v, sa8dh3v));
    sumh23v = vec_add(sa8dh2v, sa8dh3v);

    sa8dh4v = vec_max(sa8dh4v, vec_sub(zero_s16v, sa8dh4v));
    sa8dh5v = vec_max(sa8dh5v, vec_sub(zero_s16v, sa8dh5v));
    sumh45v = vec_add(sa8dh4v, sa8dh5v);

    sa8dh6v = vec_max(sa8dh6v, vec_sub(zero_s16v, sa8dh6v));
    sa8dh7v = vec_max(sa8dh7v, vec_sub(zero_s16v, sa8dh7v));
    sumh67v = vec_add(sa8dh6v, sa8dh7v);

    sumh0123v = vec_add(sumh01v, sumh23v);
    sumh4567v = vec_add(sumh45v, sumh67v);

    vec_s32_t sumblocv_h1;

    sumblocv_h1 = vec_sum4s(sumh0123v, (vec_s32_t)zerov);
    sumblocv_h1 = vec_sum4s(sumh4567v, sumblocv_h1);
    sumblocv_h1 = vec_sums(sumblocv_h1, (vec_s32_t)zerov);
    sumblocv_h1 = vec_splat(sumblocv_h1, 3);
    vec_ste(sumblocv_h1, 0, &sumh1);

    sa8dl0v = vec_max(sa8dl0v, vec_sub(zero_s16v, sa8dl0v));
    sa8dl1v = vec_max(sa8dl1v, vec_sub(zero_s16v, sa8dl1v));
    suml01v = vec_add(sa8dl0v, sa8dl1v);

    sa8dl2v = vec_max(sa8dl2v, vec_sub(zero_s16v, sa8dl2v));
    sa8dl3v = vec_max(sa8dl3v, vec_sub(zero_s16v, sa8dl3v));
    suml23v = vec_add(sa8dl2v, sa8dl3v);

    sa8dl4v = vec_max(sa8dl4v, vec_sub(zero_s16v, sa8dl4v));
    sa8dl5v = vec_max(sa8dl5v, vec_sub(zero_s16v, sa8dl5v));
    suml45v = vec_add(sa8dl4v, sa8dl5v);

    sa8dl6v = vec_max(sa8dl6v, vec_sub(zero_s16v, sa8dl6v));
    sa8dl7v = vec_max(sa8dl7v, vec_sub(zero_s16v, sa8dl7v));
    suml67v = vec_add(sa8dl6v, sa8dl7v);

    suml0123v = vec_add(suml01v, suml23v);
    suml4567v = vec_add(suml45v, suml67v);

    vec_s32_t sumblocv_l1;

    sumblocv_l1 = vec_sum4s(suml0123v, (vec_s32_t)zerov);
    sumblocv_l1 = vec_sum4s(suml4567v, sumblocv_l1);
    sumblocv_l1 = vec_sums(sumblocv_l1, (vec_s32_t)zerov);
    sumblocv_l1 = vec_splat(sumblocv_l1, 3);
    vec_ste(sumblocv_l1, 0, &suml1);

    /* A single rounded shift normalizes the raw sums of all four 8x8
     * transforms, matching the scalar sa8d definition: (sum + 2) >> 2. */
    sum = (sumh0 + suml0 + sumh1 + suml1 + 2) >> 2;
    return sum;
}

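/* The larger sa8d block sizes below are tiled from sa8d_16x16_altivec. For
 * reference, a hypothetical generic form of the same hand-unrolled sums
 * (illustrative only; not part of x265 and not wired into the primitive
 * table) would be:
 *
 *     template<int W, int H>
 *     int sa8d_tiled(const pixel* pix1, intptr_t i_pix1,
 *                    const pixel* pix2, intptr_t i_pix2)
 *     {
 *         int sum = 0;
 *         for (int y = 0; y < H; y += 16)
 *             for (int x = 0; x < W; x += 16)
 *                 sum += sa8d_16x16_altivec(pix1 + x + y * i_pix1, i_pix1,
 *                                           pix2 + x + y * i_pix2, i_pix2);
 *         return sum;
 *     }
 */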
int sa8d_16x32_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 * i_pix1, i_pix1, pix2 + 16 * i_pix2, i_pix2);
    return sum;
}

int sa8d_32x32_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    /* keep the row offsets in intptr_t so a wide stride is not truncated */
    intptr_t offset1 = 16 * i_pix1;
    intptr_t offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2);
    return sum;
}

int sa8d_32x64_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    intptr_t offset1 = 16 * i_pix1;
    intptr_t offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 * i_pix1, i_pix1, pix2 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 32 * i_pix1, i_pix1, pix2 + 16 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 * i_pix1, i_pix1, pix2 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 48 * i_pix1, i_pix1, pix2 + 16 + 48 * i_pix2, i_pix2);
    return sum;
}

int sa8d_64x64_altivec(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2)
{
    ALIGN_VAR_16(int, sum);
    intptr_t offset1 = 16 * i_pix1;
    intptr_t offset2 = 16 * i_pix2;
    sum = sa8d_16x16_altivec(pix1, i_pix1, pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16, i_pix1, pix2 + 16, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32, i_pix1, pix2 + 32, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48, i_pix1, pix2 + 48, i_pix2)
        + sa8d_16x16_altivec(pix1 + offset1, i_pix1, pix2 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + offset1, i_pix1, pix2 + 16 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + offset1, i_pix1, pix2 + 32 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + offset1, i_pix1, pix2 + 48 + offset2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 * i_pix1, i_pix1, pix2 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 32 * i_pix1, i_pix1, pix2 + 16 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + 32 * i_pix1, i_pix1, pix2 + 32 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + 32 * i_pix1, i_pix1, pix2 + 48 + 32 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 * i_pix1, i_pix1, pix2 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 16 + 48 * i_pix1, i_pix1, pix2 + 16 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 32 + 48 * i_pix1, i_pix1, pix2 + 32 + 48 * i_pix2, i_pix2)
        + sa8d_16x16_altivec(pix1 + 48 + 48 * i_pix1, i_pix1, pix2 + 48 + 48 * i_pix2, i_pix2);
    return sum;
}

/* Initialize entries for pixel functions defined in this file */
void setupPixelPrimitives_altivec(EncoderPrimitives &p)
{
#define LUMA_PU(W, H) \
    if (W <= 16) { \
        p.pu[LUMA_ ## W ## x ## H].sad    = sad16_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad16_x3_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad16_x4_altivec<W, H>; \
    } \
    else { \
        p.pu[LUMA_ ## W ## x ## H].sad    = sad_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3_altivec<W, H>; \
        p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4_altivec<W, H>; \
    }

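    /* For example, LUMA_PU(16, 8) takes the W <= 16 branch (the 16-wide
     * kernels) and expands to:
     *     p.pu[LUMA_16x8].sad    = sad16_altivec<16, 8>;
     *     p.pu[LUMA_16x8].sad_x3 = sad16_x3_altivec<16, 8>;
     *     p.pu[LUMA_16x8].sad_x4 = sad16_x4_altivec<16, 8>;
     */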
    LUMA_PU(4, 4);
    LUMA_PU(8, 8);
    LUMA_PU(16, 16);
    LUMA_PU(32, 32);
    LUMA_PU(64, 64);
    LUMA_PU(4, 8);
    LUMA_PU(8, 4);
    LUMA_PU(16, 8);
    LUMA_PU(8, 16);
    LUMA_PU(16, 12);
    LUMA_PU(12, 16);
    LUMA_PU(16, 4);
    LUMA_PU(4, 16);
    LUMA_PU(32, 16);
    LUMA_PU(16, 32);
    LUMA_PU(32, 24);
    LUMA_PU(24, 32);
    LUMA_PU(32, 8);
    LUMA_PU(8, 32);
    LUMA_PU(64, 32);
    LUMA_PU(32, 64);
    LUMA_PU(64, 48);
    LUMA_PU(48, 64);
    LUMA_PU(64, 16);
    LUMA_PU(16, 64);

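    /* In the assignments below, the commented-out names record the scalar C
     * primitives that each AltiVec routine replaces. */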
    p.pu[LUMA_4x4].satd   = satd_4x4_altivec;     // satd_4x4
    p.pu[LUMA_8x8].satd   = satd_8x8_altivec;     // satd8<8, 8>
    p.pu[LUMA_8x4].satd   = satd_8x4_altivec;     // satd_8x4
    p.pu[LUMA_4x8].satd   = satd_4x8_altivec;     // satd4<4, 8>
    p.pu[LUMA_16x16].satd = satd_16x16_altivec;   // satd8<16, 16>
    p.pu[LUMA_16x8].satd  = satd_16x8_altivec;    // satd8<16, 8>
    p.pu[LUMA_8x16].satd  = satd_8x16_altivec;    // satd8<8, 16>
    p.pu[LUMA_16x12].satd = satd_altivec<16, 12>; // satd8<16, 12>
    p.pu[LUMA_12x16].satd = satd_altivec<12, 16>; // satd4<12, 16>
    p.pu[LUMA_16x4].satd  = satd_altivec<16, 4>;  // satd8<16, 4>
    p.pu[LUMA_4x16].satd  = satd_altivec<4, 16>;  // satd4<4, 16>
    p.pu[LUMA_32x32].satd = satd_altivec<32, 32>; // satd8<32, 32>
    p.pu[LUMA_32x16].satd = satd_altivec<32, 16>; // satd8<32, 16>
    p.pu[LUMA_16x32].satd = satd_altivec<16, 32>; // satd8<16, 32>
    p.pu[LUMA_32x24].satd = satd_altivec<32, 24>; // satd8<32, 24>
    p.pu[LUMA_24x32].satd = satd_altivec<24, 32>; // satd8<24, 32>
    p.pu[LUMA_32x8].satd  = satd_altivec<32, 8>;  // satd8<32, 8>
    p.pu[LUMA_8x32].satd  = satd_altivec<8, 32>;  // satd8<8, 32>
    p.pu[LUMA_64x64].satd = satd_altivec<64, 64>; // satd8<64, 64>
    p.pu[LUMA_64x32].satd = satd_altivec<64, 32>; // satd8<64, 32>
    p.pu[LUMA_32x64].satd = satd_altivec<32, 64>; // satd8<32, 64>
    p.pu[LUMA_64x48].satd = satd_altivec<64, 48>; // satd8<64, 48>
    p.pu[LUMA_48x64].satd = satd_altivec<48, 64>; // satd8<48, 64>
    p.pu[LUMA_64x16].satd = satd_altivec<64, 16>; // satd8<64, 16>
    p.pu[LUMA_16x64].satd = satd_altivec<16, 64>; // satd8<16, 64>

    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd   = satd_4x4_altivec;     // satd_4x4
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd   = satd_8x8_altivec;     // satd8<8, 8>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = satd_16x16_altivec;   // satd8<16, 16>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd_altivec<32, 32>; // satd8<32, 32>

    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd   = satd_8x4_altivec;     // satd_8x4
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd   = satd_4x8_altivec;     // satd4<4, 8>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd  = satd_16x8_altivec;    // satd8<16, 8>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd  = satd_8x16_altivec;    // satd8<8, 16>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = satd_altivec<32, 16>; // satd8<32, 16>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = satd_altivec<16, 32>; // satd8<16, 32>

    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = satd_altivec<16, 12>; // satd4<16, 12>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = satd_altivec<12, 16>; // satd4<12, 16>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd  = satd_altivec<16, 4>;  // satd4<16, 4>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = satd_altivec<4, 16>;  // satd4<4, 16>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd_altivec<32, 24>; // satd8<32, 24>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = satd_altivec<24, 32>; // satd8<24, 32>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd  = satd_altivec<32, 8>;  // satd8<32, 8>
    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd  = satd_altivec<8, 32>;  // satd8<8, 32>

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd   = satd_4x8_altivec;     // satd4<4, 8>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd  = satd_8x16_altivec;    // satd8<8, 16>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = satd_altivec<16, 32>; // satd8<16, 32>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = satd_altivec<32, 64>; // satd8<32, 64>

    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd   = satd_4x4_altivec;     // satd_4x4
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd   = satd_8x8_altivec;     // satd8<8, 8>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = satd_altivec<4, 16>;  // satd4<4, 16>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = satd_16x16_altivec;   // satd8<16, 16>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd  = satd_altivec<8, 32>;  // satd8<8, 32>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = satd_altivec<32, 32>; // satd8<32, 32>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = satd_altivec<16, 64>; // satd8<16, 64>

    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd  = satd_altivec<8, 12>;  // satd4<8, 12>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd   = satd_8x4_altivec;     // satd4<8, 4>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = satd_altivec<16, 24>; // satd8<16, 24>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd_altivec<12, 32>; // satd4<12, 32>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd  = satd_16x8_altivec;    // satd8<16, 8>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = satd_altivec<4, 32>;  // satd4<4, 32>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = satd_altivec<32, 48>; // satd8<32, 48>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = satd_altivec<24, 64>; // satd8<24, 64>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = satd_altivec<32, 16>; // satd8<32, 16>
    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd  = satd_altivec<8, 64>;  // satd8<8, 64>

    p.cu[BLOCK_4x4].sa8d   = satd_4x4_altivec;   // satd_4x4
    p.cu[BLOCK_8x8].sa8d   = sa8d_8x8_altivec;   // sa8d_8x8
    p.cu[BLOCK_16x16].sa8d = sa8d_16x16_altivec; // sa8d_16x16
    p.cu[BLOCK_32x32].sa8d = sa8d_32x32_altivec; // sa8d16<32, 32>
    p.cu[BLOCK_64x64].sa8d = sa8d_64x64_altivec; // sa8d16<64, 64>

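    /* Chroma CU coverage follows the subsampling: with 4:2:0 a 16x16 CU
     * carries an 8x8 chroma block, with 4:2:2 an 8x16 one, so the half-size
     * sa8d routines are reused here. */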
    p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = sa8d_8x8_altivec;   // sa8d8<8, 8>
    p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = sa8d_16x16_altivec; // sa8d16<16, 16>
    p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = sa8d_32x32_altivec; // sa8d16<32, 32>

    p.chroma[X265_CSP_I422].cu[BLOCK_16x16].sa8d = sa8d_8x16_altivec;  // sa8d8<8, 16>
    p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d_16x32_altivec; // sa8d16<16, 32>
    p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d_32x64_altivec; // sa8d16<32, 64>

}
} // namespace X265_NS