/*****************************************************************************
 *
 *  XVID MPEG-4 VIDEO CODEC
 *  - Altivec 8bit<->16bit transfer -
 *
 *  Copyright(C) 2004 Christoph Naegeli <chn@kbw.ch>
 *
 *  This program is free software ; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation ; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program ; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 *
 * $Id: mem_transfer_altivec.c 1985 2011-05-18 09:02:35Z Isibaar $
 *
 ****************************************************************************/

#ifdef HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "../../portab.h"


/* Define DEBUG instead to report violated alignment assumptions at runtime */
#undef DEBUG

#include <stdio.h>

/* This function assumes:
 *	dst: 16 byte aligned
 */

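/* COPY8TO16 handles one row: it loads 16 bytes around the possibly
 * unaligned src pointer, rotates the eight bytes of interest to the front
 * with vec_perm/vec_lvsl, zero-extends them to 16 bit by merging with the
 * zero vector, and stores the result to the 16-byte-aligned dst.
 * One expansion is equivalent to this scalar sketch (illustration only,
 * not compiled):
 *
 *   int i;
 *   for (i = 0; i < 8; i++)
 *       dst[i] = (int16_t) src[i];
 *   src += stride;
 *   dst += 8;
 */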
#define COPY8TO16() \
s = vec_perm(vec_ld(0,src),vec_ld(16,src),vec_lvsl(0,src));\
vec_st((vector signed short)vec_mergeh(zerovec,s),0,dst);\
src += stride;\
dst += 8

void
transfer_8to16copy_altivec_c(int16_t *dst,
                            uint8_t * src,
                            uint32_t stride)
{
	register vector unsigned char s;
	register vector unsigned char zerovec;

#ifdef DEBUG
	/* Check the alignment */
	if((long)dst & 0xf)
		fprintf(stderr, "transfer_8to16copy_altivec_c:incorrect align, dst: %lx\n", (long)dst);
#endif

	/* initialization */
	zerovec = vec_splat_u8(0);

	COPY8TO16();
	COPY8TO16();
	COPY8TO16();
	COPY8TO16();

	COPY8TO16();
	COPY8TO16();
	COPY8TO16();
	COPY8TO16();
}


/*
 * This function assumes dst is 8 byte aligned and stride is a multiple of 8
 * src may be unaligned
 */

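/* COPY16TO8 handles one row: eight 16-bit coefficients are read from *src,
 * saturated to unsigned 8 bit with vec_packsu, rotated to dst's position
 * inside its 16-byte-aligned block, and combined with the existing
 * destination bytes via the 0x00/0xff stencil and vec_sel, so only the
 * eight target bytes are overwritten by the store.
 */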
#define COPY16TO8() \
s = vec_perm(src[0], src[1], load_src_perm); \
packed = vec_packsu(s, vec_splat_s16(0)); \
mask = vec_perm(mask_stencil, mask_stencil, vec_lvsl(0, dst)); \
packed = vec_perm(packed, packed, vec_lvsl(0, dst)); \
packed = vec_sel(packed, vec_ld(0, dst), mask); \
vec_st(packed, 0, dst); \
src++; \
dst += stride

void transfer_16to8copy_altivec_c(uint8_t *dst,
                            vector signed short *src,
                            uint32_t stride)
{
    register vector signed short s;
    register vector unsigned char packed;
    register vector unsigned char mask_stencil;
    register vector unsigned char mask;
    register vector unsigned char load_src_perm;

#ifdef DEBUG
    /* if this is on, print alignment errors */
    if(((unsigned long) dst) & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, dst %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer_16to8copy_altivec:incorrect align, stride %u\n", stride);
#endif
    /* Initialisation stuff */
    load_src_perm = vec_lvsl(0, (unsigned char*)src);
    mask_stencil = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();

    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
    COPY16TO8();
}



/*
 * This function assumes dst is 8 byte aligned and src is unaligned. Stride has
 * to be a multiple of 8
 */

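/* COPY8TO8 copies one row of eight pixels: the source row is loaded
 * unaligned, rotated to dst's offset within its 16-byte-aligned block and
 * merged with the bytes already there through vec_sel, so the 16-byte store
 * only changes the eight bytes belonging to this row.
 */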
#define COPY8TO8() \
tmp = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
t0 = vec_perm(tmp, tmp, vec_lvsl(0, dst));\
t1 = vec_perm(mask, mask, vec_lvsl(0, dst));\
tmp = vec_sel(t0, vec_ld(0, dst), t1);\
vec_st(tmp, 0, dst);\
dst += stride;\
src += stride

void
transfer8x8_copy_altivec_c( uint8_t * dst,
                            uint8_t * src,
                            uint32_t stride)
{
    register vector unsigned char tmp;
    register vector unsigned char mask;
    register vector unsigned char t0, t1;

#ifdef DEBUG
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "transfer8x8_copy_altivec:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer8x8_copy_altivec:incorrect stride, stride: %u\n", stride);
#endif
    mask = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();

    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
    COPY8TO8();
}


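/* SUB8TO16 processes one row: cur and ref are loaded unaligned and
 * zero-extended to 16 bit, cur - ref is stored to the aligned dct row, and
 * the ref row is written back over cur (rotated with vec_lvsr and merged
 * through vec_sel so only those eight bytes of cur change).
 */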
#define SUB8TO16() \
	c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur));\
	r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref));\
	cs = (vector signed short)vec_mergeh(ox00,c);\
	rs = (vector signed short)vec_mergeh(ox00,r);\
	\
	c = vec_lvsr(0,cur);\
	mask = vec_perm(mask_00ff, mask_00ff, c);\
	r = vec_perm(r, r, c);\
	r = vec_sel(r, vec_ld(0,cur), mask);\
	vec_st(r,0,cur);\
	vec_st( vec_sub(cs,rs), 0, dct );\
	\
	dct += 8;\
	cur += stride;\
	ref += stride


/* This function assumes:
 *	dct: 16 Byte aligned
 *	cur:  8 Byte aligned
 *	stride: multiple of 8
 */

void
transfer_8to16sub_altivec_c(int16_t * dct,
							uint8_t * cur,
							uint8_t * ref,
							const uint32_t stride)
{
	register vector unsigned char c,r;
	register vector unsigned char ox00;
	register vector unsigned char mask_00ff;
	register vector unsigned char mask;
	register vector signed short cs,rs;

#ifdef DEBUG
	if((long)dct & 0xf)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, dct: %lx\n", (long)dct);
	if((long)cur & 0x7)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect align, cur: %lx\n", (long)cur);
	if(stride & 0x7)
		fprintf(stderr, "transfer_8to16sub_altivec_c:incorrect stride, stride: %lu\n", (long)stride);
#endif
	/* initialize */
	ox00 = vec_splat_u8(0);
	mask_00ff = vec_pack((vector unsigned short)ox00,vec_splat_u16(-1));

	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();

	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
	SUB8TO16();
}


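/* SUBRO8TO16 is the read-only variant: one row of cur and ref is loaded
 * unaligned, zero-extended to 16 bit, and the difference cur - ref is
 * stored to the aligned dct row; cur and ref themselves are left untouched.
 */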
#define SUBRO8TO16() \
	c = vec_perm(vec_ld(0,cur),vec_ld(16,cur),vec_lvsl(0,cur));\
	r = vec_perm(vec_ld(0,ref),vec_ld(16,ref),vec_lvsl(0,ref));\
	cs = (vector signed short)vec_mergeh(z,c);\
	rs = (vector signed short)vec_mergeh(z,r);\
	vec_st( vec_sub(cs,rs), 0, dct );\
	dct += 8;\
	cur += stride;\
	ref += stride


/* This function assumes:
 *	dct: 16 Byte aligned
 */

void
transfer_8to16subro_altivec_c(int16_t * dct,
					const uint8_t * cur,
					const uint8_t * ref,
					const uint32_t stride)
{
	register vector unsigned char c;
	register vector unsigned char r;
	register vector unsigned char z;
	register vector signed short cs;
	register vector signed short rs;

#ifdef DEBUG
	/* Check the alignment assumptions if this is on */
	if((long)dct & 0xf)
		fprintf(stderr, "transfer_8to16subro_altivec_c:incorrect align, dct: %lx\n", (long)dct);
#endif
	/* initialize */
	z = vec_splat_u8(0);

	SUBRO8TO16();
	SUBRO8TO16();
	SUBRO8TO16();
	SUBRO8TO16();

	SUBRO8TO16();
	SUBRO8TO16();
	SUBRO8TO16();
	SUBRO8TO16();
}

/*
 * This function assumes:
 *  dct: 16 bytes alignment
 *  cur: 8 bytes alignment
 *  ref1: unaligned
 *  ref2: unaligned
 *  stride: multiple of 8
 */

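/* SUB28TO16 processes one row for bidirectional prediction: ref1 and ref2
 * are loaded unaligned and averaged with rounding by vec_avg, the averaged
 * bytes are merged into the 16-byte block holding cur and stored back, and
 * cur - avg(ref1, ref2) goes to the dct row. Per pixel the intended
 * operation is roughly this sketch (dct_row standing for the current
 * eight-element dct row, illustration only):
 *
 *   int i;
 *   for (i = 0; i < 8; i++) {
 *       uint8_t a = (uint8_t)((ref1[i] + ref2[i] + 1) >> 1);
 *       dct_row[i] = (int16_t) cur[i] - a;
 *       cur[i] = a;
 *   }
 */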
#define SUB28TO16() \
r1 = vec_perm(vec_ld(0, ref1), vec_ld(16, ref1), vec_lvsl(0, ref1)); \
r2 = vec_perm(vec_ld(0, ref2), vec_ld(16, ref2), vec_lvsl(0, ref2)); \
c = vec_perm(vec_ld(0, cur), vec_ld(16, cur), vec_lvsl(0, cur)); \
r = vec_avg(r1, r2); \
cs = (vector signed short)vec_mergeh(vec_splat_u8(0), c); \
rs = (vector signed short)vec_mergeh(vec_splat_u8(0), r); \
c = vec_perm(mask, mask, vec_lvsl(0, cur));\
r = vec_sel(r, vec_ld(0, cur), c);\
vec_st(r, 0, cur); \
*dct++ = vec_sub(cs, rs); \
cur += stride; \
ref1 += stride; \
ref2 += stride

void
transfer_8to16sub2_altivec_c(vector signed short *dct,
                             uint8_t *cur,
                             uint8_t *ref1,
                             uint8_t *ref2,
                             const uint32_t stride)
{
    vector unsigned char r1;
    vector unsigned char r2;
    vector unsigned char r;
    vector unsigned char c;
    vector unsigned char mask;
    vector signed short cs;
    vector signed short rs;

#ifdef DEBUG
    /* Dump alignment errors if DEBUG is set */
    if(((unsigned long)dct) & 0xf)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect align, dct: %lx\n", (long)dct);
    if(((unsigned long)cur) & 0x7)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect align, cur: %lx\n", (long)cur);
    if(stride & 0x7)
        fprintf(stderr, "transfer_8to16sub2_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    /* Initialisation */
    mask = vec_pack(vec_splat_u16(0), vec_splat_u16(-1));

    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();

    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
    SUB28TO16();
}



/*
 * This function assumes:
 *  dst: 8 byte aligned
 *  src: unaligned
 *  stride: multiple of 8
 */

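/* ADD16TO8 processes one row: the eight dst pixels are zero-extended to
 * 16 bit, the 16-bit residue row from src is added, the sums are saturated
 * back to unsigned 8 bit with vec_packsu, rotated to dst's position in its
 * 16-byte-aligned block and merged with the existing bytes via vec_sel
 * before the store. Scalar sketch of one expansion (illustration only):
 *
 *   int i, pel;
 *   for (i = 0; i < 8; i++) {
 *       pel = dst[i] + src[i];
 *       dst[i] = (uint8_t)(pel < 0 ? 0 : (pel > 255 ? 255 : pel));
 *   }
 *   src += 8;
 *   dst += stride;
 */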
#define ADD16TO8() \
s = vec_perm(vec_ld(0, src), vec_ld(16, src), vec_lvsl(0, src)); \
d = vec_perm(vec_ld(0, dst), vec_ld(16, dst), vec_lvsl(0, dst)); \
ds = (vector signed short)vec_mergeh(vec_splat_u8(0), d); \
ds = vec_add(ds, s); \
packed = vec_packsu(ds, vec_splat_s16(0)); \
mask = vec_pack(vec_splat_u16(0), vec_splat_u16(-1)); \
mask = vec_perm(mask, mask, vec_lvsl(0, dst)); \
packed = vec_perm(packed, packed, vec_lvsl(0, dst)); \
packed = vec_sel(packed, vec_ld(0, dst), mask); \
vec_st(packed, 0, dst); \
src += 8; \
dst += stride

void
transfer_16to8add_altivec_c(uint8_t *dst,
                            int16_t *src,
                            uint32_t stride)
{
    vector signed short s;
    vector signed short ds;
    vector unsigned char d;
    vector unsigned char packed;
    vector unsigned char mask;

#ifdef DEBUG
    /* if this is set, dump alignment errors */
    if(((unsigned long)dst) & 0x7)
        fprintf(stderr, "transfer_16to8add_altivec_c:incorrect align, dst: %lx\n", (long)dst);
    if(stride & 0x7)
        fprintf(stderr, "transfer_16to8add_altivec_c:incorrect stride, stride: %u\n", stride);
#endif

    ADD16TO8();
    ADD16TO8();
    ADD16TO8();
    ADD16TO8();

    ADD16TO8();
    ADD16TO8();
    ADD16TO8();
    ADD16TO8();
}