1 /*
2 
3     Copyright (C) 2002 Benjamin Herrenschmidt <benh@kernel.crashing.org>
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; if not, write to the Free Software
17     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18 
19 
20     $Id: sad_altivec.c 1985 2011-05-18 09:02:35Z Isibaar $
21 */
22 
23 #ifdef HAVE_ALTIVEC_H
24 #include <altivec.h>
25 #endif
26 
27 
28 #include "../../portab.h"
29 
30 /* no debugging by default */
31 #undef DEBUG
32 
33 #include <stdio.h>
34 
/*
 * SAD16: add the absolute differences of one 16-byte row to the running
 * total and advance to the next row.
 *
 * Operates on names from the enclosing function: cur, ref, stride (counted
 * in 16-byte vectors), perm, t1, t2, sad, sumdiffs, best_vec and the label
 * `bail`, which is jumped to as soon as the running total reaches best_vec.
 *
 * Wrapped in do { } while (0) so that `SAD16();` is a single statement.
 */
#define SAD16() \
do { \
	t1  = vec_perm(ref[0], ref[1], perm);  /* realign the reference row      */ \
	t2  = vec_max(t1, *cur);               /* per-byte larger of the two     */ \
	t1  = vec_min(t1, *cur);               /* per-byte smaller of the two    */ \
	t1  = vec_sub(t2, t1);                 /* absolute difference per byte   */ \
	sad = vec_sum4s(t1, vec_splat_u32(0)); /* partial sums, one per word     */ \
	sumdiffs = (vector unsigned int)vec_sums((vector signed int)sad, (vector signed int)sumdiffs); /* fold into running total */ \
	if (vec_any_ge(sumdiffs, best_vec)) \
		goto bail; \
	cur += stride; ref += stride; \
} while (0)
45 
46 /*
47  * This function assumes cur and stride are 16 bytes aligned and ref is unaligned
48  */
49 
50 uint32_t
sad16_altivec_c(vector unsigned char * cur,vector unsigned char * ref,uint32_t stride,const uint32_t best_sad)51 sad16_altivec_c(vector unsigned char *cur,
52 			  vector unsigned char *ref,
53 			  uint32_t stride,
54 			  const uint32_t best_sad)
55 {
56 	vector unsigned char perm;
57 	vector unsigned char t1, t2;
58 	vector unsigned int sad;
59 	vector unsigned int sumdiffs;
60 	vector unsigned int best_vec;
61 	uint32_t result;
62 
63 
64 #ifdef DEBUG
65         /* print alignment errors if DEBUG is on */
66 	if (((unsigned long) cur) & 0xf)
67 		fprintf(stderr, "sad16_altivec:incorrect align, cur: %lx\n", (long)cur);
68 	if (stride & 0xf)
69 		fprintf(stderr, "sad16_altivec:incorrect align, stride: %lu\n", stride);
70 #endif
71 	/* initialization */
72 	sad = vec_splat_u32(0);
73 	sumdiffs = sad;
74 	stride >>= 4;
75 	perm = vec_lvsl(0, (unsigned char *) ref);
76 	*((uint32_t*)&best_vec) = best_sad;
77 	best_vec = vec_splat(best_vec, 0);
78 
79 	/* perform sum of differences between current and previous */
80 	SAD16();
81 	SAD16();
82 	SAD16();
83 	SAD16();
84 
85 	SAD16();
86 	SAD16();
87 	SAD16();
88 	SAD16();
89 
90 	SAD16();
91 	SAD16();
92 	SAD16();
93 	SAD16();
94 
95 	SAD16();
96 	SAD16();
97 	SAD16();
98 	SAD16();
99 
100   bail:
101 	/* copy vector sum into unaligned result */
102 	sumdiffs = vec_splat(sumdiffs, 3);
103 	vec_ste(sumdiffs, 0, (uint32_t*) &result);
104 	return result;
105 }
106 
107 
/*
 * SAD8: accumulate the absolute differences of one row into `sad` and
 * advance cur and ref by stride bytes.
 *
 * Both rows are loaded as two aligned 16-byte vectors and realigned with
 * vec_perm, so neither pointer needs to be aligned. All 16 bytes of the
 * realigned row are summed here. Uses the enclosing function's c, r, sad,
 * cur, ref and stride.
 *
 * Wrapped in do { } while (0) so that `SAD8();` is a single statement.
 */
#define SAD8() \
do { \
	c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur)); /* realigned current row    */ \
	r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref)); /* realigned reference row  */ \
	c = vec_sub(vec_max(c,r),vec_min(c,r));                     /* absolute diff per byte   */ \
	sad = vec_sum4s(c,sad);                                     /* add into per-word sums   */ \
	cur += stride; \
	ref += stride; \
} while (0)
115 
116 /*
117  * This function assumes nothing
118  */
119 
120 uint32_t
sad8_altivec_c(const uint8_t * cur,const uint8_t * ref,const uint32_t stride)121 sad8_altivec_c(const uint8_t * cur,
122 	   const uint8_t *ref,
123 	   const uint32_t stride)
124 {
125 	uint32_t result = 0;
126 
127 	register vector unsigned int sad;
128 	register vector unsigned char c;
129 	register vector unsigned char r;
130 
131 	/* initialize */
132 	sad = vec_splat_u32(0);
133 
134 	/* Perform sad operations */
135 	SAD8();
136 	SAD8();
137 	SAD8();
138 	SAD8();
139 
140 	SAD8();
141 	SAD8();
142 	SAD8();
143 	SAD8();
144 
145 	/* finish addition, add the first 2 together */
146 	sad = vec_and(sad, (vector unsigned int)vec_pack(vec_splat_u16(-1),vec_splat_u16(0)));
147 	sad = (vector unsigned int)vec_sums((vector signed int)sad, vec_splat_s32(0));
148 	sad = vec_splat(sad,3);
149 	vec_ste(sad, 0, &result);
150 
151 	return result;
152 }
153 
154 
155 
156 
/*
 * MEAN16: add the 16 bytes at *ptr into the four per-word sums of `mean`
 * and step ptr to the next row (stride is counted in 16-byte vectors).
 *
 * Wrapped in do { } while (0) so that `MEAN16();` is a single statement.
 */
#define MEAN16() \
do { \
	mean = vec_sum4s(*ptr,mean); \
	ptr += stride; \
} while (0)
160 
/*
 * DEV16: add |pixel - mn| for the 16 bytes at *ptr into the per-word sums
 * of `dev` and step ptr to the next row. `mn` holds the mean replicated
 * into every byte; t2 and t3 are scratch vectors of the enclosing function.
 *
 * Wrapped in do { } while (0) so that `DEV16();` is a single statement.
 */
#define DEV16() \
do { \
	t2  = vec_max(*ptr, mn);                    /* per-byte larger of the two    */ \
	t3  = vec_min(*ptr, mn);                    /* per-byte smaller of the two   */ \
	t2  = vec_sub(t2, t3);                      /* absolute difference per byte  */ \
	dev = vec_sum4s(t2, dev);                   /* add into per-word sums        */ \
	ptr += stride; \
} while (0)
167 
168 /*
169  * This function assumes cur is 16 bytes aligned and stride is 16 bytes
170  * aligned
171 */
172 
173 uint32_t
dev16_altivec_c(vector unsigned char * cur,uint32_t stride)174 dev16_altivec_c(vector unsigned char *cur,
175 			  uint32_t stride)
176 {
177 	vector unsigned char t2, t3, mn;
178 	vector unsigned int mean, dev;
179 	vector unsigned int sumdiffs;
180 	vector unsigned char *ptr;
181 	uint32_t result;
182 
183 #ifdef DEBUG
184         /* print alignment errors if DEBUG is on */
185         if(((unsigned long)cur) & 0x7)
186             fprintf(stderr, "dev16_altivec:incorrect align, cur: %lx\n", (long)cur);
187         if(stride & 0xf)
188             fprintf(stderr, "dev16_altivec:incorrect align, stride: %lu\n", stride);
189 #endif
190 
191 	dev = mean = vec_splat_u32(0);
192 	stride >>= 4;
193 
194 	/* set pointer to iterate through cur */
195 	ptr = cur;
196 
197 	MEAN16();
198 	MEAN16();
199 	MEAN16();
200 	MEAN16();
201 	MEAN16();
202 	MEAN16();
203 	MEAN16();
204 	MEAN16();
205 	MEAN16();
206 	MEAN16();
207 	MEAN16();
208 	MEAN16();
209 	MEAN16();
210 	MEAN16();
211 	MEAN16();
212 	MEAN16();
213 
214         /* Add all together in sumdiffs */
215 	sumdiffs = (vector unsigned int)vec_sums((vector signed int) mean, vec_splat_s32(0));
216         /* teilen durch 16 * 16 */
217         mn = vec_perm((vector unsigned char)sumdiffs, (vector unsigned char)sumdiffs, vec_splat_u8(14));
218 
219         /* set pointer to iterate through cur */
220         ptr = cur;
221 
222 	DEV16();
223 	DEV16();
224 	DEV16();
225 	DEV16();
226 	DEV16();
227 	DEV16();
228 	DEV16();
229 	DEV16();
230 	DEV16();
231 	DEV16();
232 	DEV16();
233 	DEV16();
234 	DEV16();
235 	DEV16();
236 	DEV16();
237 	DEV16();
238 
239 	/* sum all parts of difference into one 32 bit quantity */
240 	sumdiffs = (vector unsigned int)vec_sums((vector signed int) dev, vec_splat_s32(0));
241 
242 	/* copy vector sum into unaligned result */
243 	sumdiffs = vec_splat(sumdiffs, 3);
244 	vec_ste(sumdiffs, 0, (uint32_t*) &result);
245 	return result;
246 }
247 
/*
 * SAD16BI: for one 16-byte row, average the two (realigned) reference rows,
 * take the per-byte absolute difference against *cur, add it into `sum`,
 * and advance all three pointers by stride (counted in 16-byte vectors).
 *
 * Uses the enclosing function's t1, t2, mask1, mask2, sad, sum, cur, ref1,
 * ref2 and stride.
 *
 * Wrapped in do { } while (0) so that `SAD16BI();` is a single statement.
 */
#define SAD16BI() \
do { \
    t1 = vec_perm(ref1[0], ref1[1], mask1); /* realign first reference row   */ \
    t2 = vec_perm(ref2[0], ref2[1], mask2); /* realign second reference row  */ \
    t1 = vec_avg(t1, t2);                   /* per-byte average of the two   */ \
    t2 = vec_max(t1, *cur); \
    t1 = vec_min(t1, *cur); \
    sad = vec_sub(t2, t1);                  /* absolute difference per byte  */ \
    sum = vec_sum4s(sad, sum);              /* add into per-word sums        */ \
    cur += stride; \
    ref1 += stride; \
    ref2 += stride; \
} while (0)
259 
260 /*
261  * This function assumes cur is 16 bytes aligned, stride is 16 bytes
262  * aligned and ref1 and ref2 is unaligned
263 */
264 
265 uint32_t
sad16bi_altivec_c(vector unsigned char * cur,vector unsigned char * ref1,vector unsigned char * ref2,uint32_t stride)266 sad16bi_altivec_c(vector unsigned char *cur,
267                         vector unsigned char *ref1,
268                         vector unsigned char *ref2,
269                         uint32_t stride)
270 {
271     vector unsigned char t1, t2;
272     vector unsigned char mask1, mask2;
273     vector unsigned char sad;
274     vector unsigned int sum;
275     uint32_t result;
276 
277 #ifdef DEBUG
278     /* print alignment errors if this is on */
279     if((long)cur & 0xf)
280         fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lx\n", (long)cur);
281     if(stride & 0xf)
282         fprintf(stderr, "sad16bi_altivec:incorrect align, cur: %lu\n", stride);
283 #endif
284 
285     /* Initialisation stuff */
286     stride >>= 4;
287     mask1 = vec_lvsl(0, (unsigned char*)ref1);
288     mask2 = vec_lvsl(0, (unsigned char*)ref2);
289     sad = vec_splat_u8(0);
290     sum = (vector unsigned int)sad;
291 
292     SAD16BI();
293     SAD16BI();
294     SAD16BI();
295     SAD16BI();
296 
297     SAD16BI();
298     SAD16BI();
299     SAD16BI();
300     SAD16BI();
301 
302     SAD16BI();
303     SAD16BI();
304     SAD16BI();
305     SAD16BI();
306 
307     SAD16BI();
308     SAD16BI();
309     SAD16BI();
310     SAD16BI();
311 
312     sum = (vector unsigned int)vec_sums((vector signed int)sum, vec_splat_s32(0));
313     sum = vec_splat(sum, 3);
314     vec_ste(sum, 0, (uint32_t*)&result);
315 
316     return result;
317 }
318 
319 
/*
 * SSE8_16BIT: for one row of eight 16-bit values, add the squared
 * differences between b1 and b2 into `sum`, then advance both pointers by
 * stride BYTES (hence the byte-pointer casts).
 *
 * Each row is loaded as two aligned vectors and realigned with vec_perm, so
 * neither pointer needs to be aligned. Uses the enclosing function's
 * b1_vec, b2_vec, diff, sum, b1, b2 and stride.
 *
 * The byte-pointer casts keep the const qualifier of b1/b2.
 * Wrapped in do { } while (0) so that `SSE8_16BIT();` is a single statement.
 */
#define SSE8_16BIT() \
do { \
	b1_vec = vec_perm(vec_ld(0,b1), vec_ld(16,b1), vec_lvsl(0,b1)); \
	b2_vec = vec_perm(vec_ld(0,b2), vec_ld(16,b2), vec_lvsl(0,b2)); \
	diff = vec_sub(b1_vec,b2_vec);  \
	sum = vec_msum(diff,diff,sum);  /* sum += diff * diff, pairwise per word */ \
	b1 = (const int16_t*)((const int8_t*)b1+stride);  \
	b2 = (const int16_t*)((const int8_t*)b2+stride); \
} while (0)
327 
328 uint32_t
sse8_16bit_altivec_c(const int16_t * b1,const int16_t * b2,const uint32_t stride)329 sse8_16bit_altivec_c(const int16_t * b1,
330 			 const int16_t * b2,
331 			 const uint32_t stride)
332 {
333     register vector signed short b1_vec;
334     register vector signed short b2_vec;
335     register vector signed short diff;
336     register vector signed int sum;
337     uint32_t result;
338 
339     /* initialize */
340     sum = vec_splat_s32(0);
341 
342     SSE8_16BIT();
343     SSE8_16BIT();
344     SSE8_16BIT();
345     SSE8_16BIT();
346 
347     SSE8_16BIT();
348     SSE8_16BIT();
349     SSE8_16BIT();
350     SSE8_16BIT();
351 
352     /* sum the vector */
353     sum = vec_sums(sum, vec_splat_s32(0));
354     sum = vec_splat(sum,3);
355 
356     vec_ste(sum,0,(int*)&result);
357 
358     /* and return */
359     return result;
360 }
361