1 /* subsample_image.c, this file is part of the
2 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3 * Copyright (C) 2002 James Klicman <james@klicman.org>
4 *
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include "altivec_motion.h"
25
26 #if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(subsample_image)
27 #include <stdlib.h>
28 #endif
29
30 #include "vectorize.h"
31 #include "../mjpeg_logging.h"
32
33 /* #define AMBER_ENABLE */
34 #include "amber.h"
35
36 #ifdef HAVE_ALTIVEC_H
37 /* include last to ensure AltiVec type semantics, especially for bool. */
38 #include <altivec.h>
39 #endif
40
41
/* Shared parameter list / argument list / printf format for the
 * subsample_image entry points and their test harness. */
#define SUBSAMPLE_IMAGE_PDECL /* {{{ */ \
    uint8_t *image, int rowstride, \
    uint8_t *sub22_image, \
    uint8_t *sub44_image \
    /* }}} */
#define SUBSAMPLE_IMAGE_ARGS image, rowstride, sub22_image, sub44_image
/* Pointers are formatted with %p: the previous 0x%X expected an unsigned
 * int and is undefined behavior (and truncates) for 64-bit pointers. */
#define SUBSAMPLE_IMAGE_PFMT /* {{{ */ \
    "image=%p, rowstride=%d, sub22_image=%p, sub44_image=%p" \
    /* }}} */
51
subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)52 void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
53 {
54 int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
55 unsigned char *pB, *pB2, *pB4;
56 vector unsigned char l0, l1, l2, l3;
57 vector unsigned short s0, s1, s2, s3;
58 vector unsigned short s22_0, s22_1, s22_2, s22_3;
59 vector unsigned short s44, s44_0, s44_1;
60 vector unsigned short zero, two;
61 #ifdef ALTIVEC_DST
62 DataStreamControl dsc;
63 #endif
64
65 #ifdef ALTIVEC_VERIFY
66 if (NOT_VECTOR_ALIGNED(image))
67 mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
68 "image", 16, image);
69 if (NOT_VECTOR_ALIGNED(sub22_image))
70 mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
71 "sub22_image", 16, sub22_image);
72 if (NOT_VECTOR_ALIGNED(sub44_image))
73 mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
74 "sub44_image", 16, sub44_image);
75
76 if ((rowstride & 63) != 0)
77 mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
78 "rowstride", 64, rowstride);
79 #endif
80
81 AMBER_START;
82
83 pB = image;
84
85 #ifdef ALTIVEC_DST
86 dsc.control = DATA_STREAM_CONTROL(6,4,0);
87 dsc.block.stride = rowstride;
88
89 vec_dst(pB, dsc.control, 0);
90 #endif
91
92 pB2 = sub22_image;
93 pB4 = sub44_image;
94
95 j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */
96
97 stride1 = rowstride;
98 stride2 = stride1 + stride1;
99 stride3 = stride2 + stride1;
100 stride4 = stride2 + stride2;
101 halfstride = stride1 >> 1; /* /2 */
102
103 ii = rowstride >> 6; /* rowstride/16/4 */
104
105 zero = vec_splat_u16(0);
106 two = vec_splat_u16(2);
107
108 do {
109 i = ii;
110 do {
111 l0 = vec_ld(0, pB);
112 l1 = vec_ld(stride1, pB);
113 l2 = vec_ld(stride2, pB);
114 l3 = vec_ld(stride3, pB);
115 pB += 16;
116 #ifdef ALTIVEC_DST
117 vec_dst(pB + (16 * 3), dsc.control, 0);
118 #endif
119
120 /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
121 /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
122 /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
123 /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */
124
125 /* s0 = 0x[00,01, 02,03, 04,05, 06,07, ] */
126 /* [ 10,11, 12,13, 14,15, 16,17] */
127 s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
128 /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
129 s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
130
131 /* s1 = 0x[08,09, 0A,0B, 0C,0D, 0E,0F, ] */
132 /* [ 18,19, 1A,1B, 1C,1D, 1E,1F] */
133 s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
134 /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
135 s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
136
137 /* s2 = 0x[20,21, 22,23, 24,25, 26,27, ] */
138 /* [ 30,31, 32,33, 34,35, 36,37] */
139 s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
140 /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
141 s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
142
143 /* s3 = 0x[28,29, 2A,2B, 2C,2D, 2E,2F, ] */
144 /* [ 38,39, 3A,3B, 3C,3D, 3E,3F] */
145 s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
146 /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
147 s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
148
149 /* start loading next block */
150 l0 = vec_ld(0, pB);
151 l1 = vec_ld(stride1, pB);
152 l2 = vec_ld(stride2, pB);
153 l3 = vec_ld(stride3, pB);
154 pB += 16;
155
156 /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
157 /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
158 /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
159 /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */
160
161 /* s22_0 = 0x[ 00, 02, 04, 06, 08, 0A, 0C, 0E] */
162 s22_0 = vec_packsu(vu32(s0), vu32(s1));
163 /* s22_1 = 0x[ 20, 22, 24, 26, 28, 2A, 2C, 2E] */
164 s22_1 = vec_packsu(vu32(s2), vu32(s3));
165
166 /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
167 s22_0 = vec_add(s22_0, two);
168 /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
169 s22_1 = vec_add(s22_1, two);
170
171 /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
172 s22_0 = vec_sra(s22_0, two);
173 /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
174 s22_1 = vec_sra(s22_1, two);
175
176 /* s22_0 = 0x[ 00, 02, 04, 06, 08, 0A, 0C, 0E] */
177 /* s22_1 = 0x[ 20, 22, 24, 26, 28, 2A, 2C, 2E] */
178 /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
179 s44_0 = vec_add(s22_0, s22_1);
180
181 /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
182 s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));
183
184 /* - - - - - - - - - - - - - - - - - - - */
185 s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
186 s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
187 s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
188 s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
189 s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
190 s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
191 s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
192 s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
193
194 /* start loading next l[0-3] */
195 l0 = vec_ld(0, pB);
196 l1 = vec_ld(stride1, pB);
197 l2 = vec_ld(stride2, pB);
198 l3 = vec_ld(stride3, pB);
199 pB += 16;
200
201
202 s22_2 = vec_packsu(vu32(s0), vu32(s1));
203 s22_3 = vec_packsu(vu32(s2), vu32(s3));
204
205 s22_2 = vec_add(s22_2, two);
206 s22_3 = vec_add(s22_3, two);
207
208 s22_2 = vec_sra(s22_2, two);
209 s22_3 = vec_sra(s22_3, two);
210
211
212 s44_1 = vec_add(s22_2, s22_3);
213 s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));
214
215 /* store s22 block */
216 s22_0 = vu16(vec_packsu(s22_0, s22_2));
217 s22_1 = vu16(vec_packsu(s22_1, s22_3));
218 vec_st(vu8(s22_0), 0, pB2);
219 vec_st(vu8(s22_1), halfstride, pB2);
220 pB2 += 16;
221
222 /* - - - - - - - - - - - - - - - - - - - */
223 s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
224 s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
225 s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
226 s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
227 s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
228 s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
229 s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
230 s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
231
232 /* starting loading next l[0-3] */
233 l0 = vec_ld(0, pB);
234 l1 = vec_ld(stride1, pB);
235 l2 = vec_ld(stride2, pB);
236 l3 = vec_ld(stride3, pB);
237 pB += 16;
238
239
240 s22_0 = vec_packsu(vu32(s0), vu32(s1));
241 s22_1 = vec_packsu(vu32(s2), vu32(s3));
242
243 s22_0 = vec_add(s22_0, two);
244 s22_1 = vec_add(s22_1, two);
245
246 s22_0 = vec_sra(s22_0, two);
247 s22_1 = vec_sra(s22_1, two);
248
249
250 s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
251 s44 = vec_add(s44, two);
252 s44 = vec_sra(s44, two);
253
254 s44_0 = vec_add(s22_0, s22_1);
255 s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));
256
257 /* - - - - - - - - - - - - - - - - - - - */
258 s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
259 s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
260 s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
261 s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
262 s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
263 s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
264 s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
265 s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
266
267 s22_2 = vec_packsu(vu32(s0), vu32(s1));
268 s22_3 = vec_packsu(vu32(s2), vu32(s3));
269
270 s22_2 = vec_add(s22_2, two);
271 s22_3 = vec_add(s22_3, two);
272
273 s22_2 = vec_sra(s22_2, two);
274 s22_3 = vec_sra(s22_3, two);
275
276 s44_1 = vec_add(s22_2, s22_3);
277 s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));
278
279 /* store s22 block */
280 s22_0 = vu16(vec_packsu(s22_0, s22_2));
281 s22_1 = vu16(vec_packsu(s22_1, s22_3));
282 vec_st(vu8(s22_0), 0, pB2);
283 vec_st(vu8(s22_1), halfstride, pB2);
284 pB2 += 16;
285
286 /* pack all four s44 chunks */
287 s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
288 s44_0 = vec_add(s44_0, two);
289 s44_0 = vec_sra(s44_0, two);
290 s44 = vu16(vec_packsu(s44, s44_0));
291
292 vec_st(vu8(s44), 0, pB4);
293 pB4 += 16;
294
295 } while (--i);
296
297 pB += stride3;
298 pB2 += halfstride;
299
300 } while (--j);
301
302 #ifdef ALTIVEC_DST
303 vec_dss(0);
304 #endif
305
306 AMBER_STOP;
307 }
308
309 #if ALTIVEC_TEST_FUNCTION(subsample_image) /* {{{ */
310 # ifdef ALTIVEC_VERIFY
311
/* Copy a width*height pixel region from s (whose rows are stride bytes
 * apart) into d as a densely packed buffer (rows width bytes apart). */
static void imgcpy(uint8_t *d, uint8_t *s, int width, int height, int stride)
{
    int x, y;

    for (y = 0; y < height; y++, d += width, s += stride) {
        for (x = 0; x < width; x++) {
            d[x] = s[x];
        }
    }
}
323
/* Sum every pixel of a width*height region whose rows are stride bytes
 * apart; used as a cheap fingerprint when verifying subsample results.
 * (Local renamed from `checksum` to avoid shadowing the function name.) */
static unsigned long checksum(uint8_t *p, int width, int height, int stride)
{
    unsigned long sum = 0;
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            sum += p[x];
        p += stride;
    }

    return sum;
}
337
/* Log (via mjpeg_debug) every position where the packed reference copy
 * `a` (width bytes per row) differs from image `b` (stride bytes per
 * row).  `ss` labels the image ("2*2"/"4*4") in the debug output. */
static void imgcmp(const char *ss, uint8_t *a, uint8_t *b,
		   int width, int height, int stride)
{
    int x, y;

    for (y = 0; y < height; y++, a += width, b += stride) {
        for (x = 0; x < width; x++) {
            if (a[x] != b[x])
                mjpeg_debug("subsample_image: %s[%d][%d] %d != %d",
                            ss, y, x, a[x], b[x]);
        }
    }
}
353
subsample_image_altivec_verify(SUBSAMPLE_IMAGE_PDECL)354 void subsample_image_altivec_verify(SUBSAMPLE_IMAGE_PDECL)
355 {
356 int width, height;
357 unsigned long checksum44_1, checksum44_2;
358 unsigned long checksum22_1, checksum22_2;
359 unsigned char *cpy22, *cpy44;
360
361 width = rowstride;
362 height = (unsigned long)(sub22_image - image) / rowstride;
363
364 cpy22 = (unsigned char*)malloc((width/2) * (height/2));
365 cpy44 = (unsigned char*)malloc((width/4) * (height/4));
366 if (cpy22 == NULL || cpy44 == NULL)
367 mjpeg_error_exit1("subsample_image: malloc failed");
368
369 subsample_image_altivec(SUBSAMPLE_IMAGE_ARGS);
370 checksum22_1 = checksum(sub22_image, width/2, height/2, rowstride/2);
371 checksum44_1 = checksum(sub44_image, width/4, height/4, rowstride/4);
372
373 /* copy data for imgcmp */
374 imgcpy(cpy22, sub22_image, width/2, height/2, rowstride/2);
375 imgcpy(cpy44, sub44_image, width/4, height/4, rowstride/4);
376
377 ALTIVEC_TEST_WITH(subsample_image)(SUBSAMPLE_IMAGE_ARGS);
378 checksum22_2 = checksum(sub22_image, width/2, height/2, rowstride/2);
379 checksum44_2 = checksum(sub44_image, width/4, height/4, rowstride/4);
380
381 if (checksum22_1 != checksum22_2 || checksum44_1 != checksum44_2) {
382 mjpeg_debug("subsample_image(" SUBSAMPLE_IMAGE_PFMT ")",
383 SUBSAMPLE_IMAGE_ARGS);
384 if (checksum22_1 != checksum22_2)
385 mjpeg_debug("subsample_image: %s checksums differ %d != %d",
386 "2*2", checksum22_1, checksum22_2);
387 if (checksum44_1 != checksum44_2)
388 mjpeg_debug("subsample_image: %s checksums differ %d != %d",
389 "4*4", checksum44_1, checksum44_2);
390
391 imgcmp("2*2", cpy22, sub22_image, width/2, height/2, rowstride/2);
392 imgcmp("4*4", cpy44, sub44_image, width/4, height/4, rowstride/4);
393 }
394
395 free(cpy22);
396 free(cpy44);
397 }
398
399 # else
400
/* Benchmark-harness knobs consumed by ALTIVEC_TEST below. */
#undef BENCHMARK_ITERATIONS
#define BENCHMARK_ITERATIONS 1000
/* fixed: #undef takes only the macro name; the previous stray "1" after
 * BENCHMARK_FREQUENCY was an ill-formed directive (extra tokens). */
#undef BENCHMARK_FREQUENCY
#define BENCHMARK_FREQUENCY 1
405
406 ALTIVEC_TEST(subsample_image, void, (SUBSAMPLE_IMAGE_PDECL),
407 SUBSAMPLE_IMAGE_PFMT, SUBSAMPLE_IMAGE_ARGS);
408 # endif
409 #endif /* }}} */
410 /* vim:set foldmethod=marker foldlevel=0: */
411