1 /* subsample_image.c, this file is part of the
2  * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3  * Copyright (C) 2002  James Klicman <james@klicman.org>
4  *
5  * This library is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License as published by
7  * the Free Software Foundation; either version 2 of the License, or
8  * (at your option) any later version.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 
24 #include "altivec_motion.h"
25 
#if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(subsample_image)
#include <stdlib.h>
#include <string.h>
#endif

#include "vectorize.h"
#include "../mjpeg_logging.h"
32 
33 /* #define AMBER_ENABLE */
34 #include "amber.h"
35 
36 #ifdef HAVE_ALTIVEC_H
37 /* include last to ensure AltiVec type semantics, especially for bool. */
38 #include <altivec.h>
39 #endif
40 
41 
42 #define SUBSAMPLE_IMAGE_PDECL /* {{{ */                                      \
43 	uint8_t *image, int rowstride,                                       \
44 	uint8_t *sub22_image,                                                \
45 	uint8_t *sub44_image                                                 \
46 	/* }}} */
47 #define SUBSAMPLE_IMAGE_ARGS image, rowstride, sub22_image, sub44_image
48 #define SUBSAMPLE_IMAGE_PFMT /* {{{ */                                       \
49 	"image=0x%X, rowstride=%d, sub22_image=0x%X, sub44_image=0x%X"       \
50 	/* }}} */
51 
subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)52 void subsample_image_altivec(SUBSAMPLE_IMAGE_PDECL)
53 {
54     int i, ii, j, stride1, stride2, stride3, stride4, halfstride;
55     unsigned char *pB, *pB2, *pB4;
56     vector unsigned char l0, l1, l2, l3;
57     vector unsigned short s0, s1, s2, s3;
58     vector unsigned short s22_0, s22_1, s22_2, s22_3;
59     vector unsigned short s44, s44_0, s44_1;
60     vector unsigned short zero, two;
61 #ifdef ALTIVEC_DST
62     DataStreamControl dsc;
63 #endif
64 
65 #ifdef ALTIVEC_VERIFY
66     if (NOT_VECTOR_ALIGNED(image))
67 	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
68 	    "image", 16, image);
69     if (NOT_VECTOR_ALIGNED(sub22_image))
70 	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
71 	    "sub22_image", 16, sub22_image);
72     if (NOT_VECTOR_ALIGNED(sub44_image))
73 	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
74 	    "sub44_image", 16, sub44_image);
75 
76     if ((rowstride & 63) != 0)
77 	mjpeg_error_exit1("subsample_image: %s %% %d != 0, (%d)",
78 	    "rowstride", 64, rowstride);
79 #endif
80 
81     AMBER_START;
82 
83     pB = image;
84 
85 #ifdef ALTIVEC_DST
86     dsc.control = DATA_STREAM_CONTROL(6,4,0);
87     dsc.block.stride = rowstride;
88 
89     vec_dst(pB, dsc.control, 0);
90 #endif
91 
92     pB2 = sub22_image;
93     pB4 = sub44_image;
94 
95     j = ((unsigned long)(pB2 - pB) / rowstride) >> 2; /* height/4 */
96 
97     stride1 = rowstride;
98     stride2 = stride1 + stride1;
99     stride3 = stride2 + stride1;
100     stride4 = stride2 + stride2;
101     halfstride = stride1 >> 1; /* /2 */
102 
103     ii = rowstride >> 6; /* rowstride/16/4 */
104 
105     zero = vec_splat_u16(0);
106     two = vec_splat_u16(2);
107 
108     do {
109 	i = ii;
110 	do {
111 	    l0 = vec_ld(0, pB);
112 	    l1 = vec_ld(stride1, pB);
113 	    l2 = vec_ld(stride2, pB);
114 	    l3 = vec_ld(stride3, pB);
115 	    pB += 16;
116 #ifdef ALTIVEC_DST
117 	    vec_dst(pB + (16 * 3), dsc.control, 0);
118 #endif
119 
120 	    /* l0 = 0x[00,01,02,03,04,05,06,07,08,09,0A,0B,0C,0D,0E,0F] */
121 	    /* l1 = 0x[10,11,12,13,14,15,16,17,18,19,1A,1B,1C,1D,1E,1F] */
122 	    /* l2 = 0x[20,21,22,23,24,25,26,27,28,29,2A,2B,2C,2D,2E,2F] */
123 	    /* l3 = 0x[30,31,32,33,34,35,36,37,38,39,3A,3B,3C,3D,3E,3F] */
124 
125 	    /* s0 = 0x[00,01,      02,03,      04,05,      06,07,     ] */
126 	    /*        [      10,11,      12,13,      14,15,      16,17] */
127 	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
128 	    /* s0 = 0x[00+01+10+11,02+03+12+13,04+05+14+15,06+07+16+17] */
129 	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
130 
131 	    /* s1 = 0x[08,09,      0A,0B,      0C,0D,      0E,0F,     ] */
132 	    /*        [      18,19,      1A,1B,      1C,1D,      1E,1F] */
133 	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
134 	    /* s1 = 0x[08+09+18+19,0A+0B+1A+1B,0C+0D+1C+1D,0E+0F+1E+1F] */
135 	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
136 
137 	    /* s2 = 0x[20,21,      22,23,      24,25,      26,27,     ] */
138 	    /*        [      30,31,      32,33,      34,35,      36,37] */
139 	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
140 	    /* s2 = 0x[20+21+30+31,22+23+32+33,24+25+34+35,26+27+36+37] */
141 	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
142 
143 	    /* s3 = 0x[28,29,      2A,2B,      2C,2D,      2E,2F,     ] */
144 	    /*        [      38,39,      3A,3B,      3C,3D,      3E,3F] */
145 	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
146 	    /* s3 = 0x[28+29+38+39,2A+2B+3A+3B,2C+2D+3C+3D,2E+2F+3E+3F] */
147 	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
148 
149 	    /* start loading next block */
150 	    l0 = vec_ld(0, pB);
151 	    l1 = vec_ld(stride1, pB);
152 	    l2 = vec_ld(stride2, pB);
153 	    l3 = vec_ld(stride3, pB);
154 	    pB += 16;
155 
156 	    /* s0 = 0x[00+01+10+11, 02+03+12+13, 04+05+14+15, 06+07+16+17] */
157 	    /* s1 = 0x[08+09+18+19, 0A+0B+1A+1B, 0C+0D+1C+1D, 0E+0F+1E+1F] */
158 	    /* s2 = 0x[20+21+30+31, 22+23+32+33, 24+25+34+35, 26+27+36+37] */
159 	    /* s3 = 0x[28+29+38+39, 2A+2B+3A+3B, 2C+2D+3C+3D, 2E+2F+3E+3F] */
160 
161 	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
162 	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
163 	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
164 	    s22_1 = vec_packsu(vu32(s2), vu32(s3));
165 
166 	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]) + 2 */
167 	    s22_0 = vec_add(s22_0, two);
168 	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]) + 2 */
169 	    s22_1 = vec_add(s22_1, two);
170 
171 	    /* (pB[i]+pB[i+1]+pN[i]+pN[i+1]+2) >> 2 */
172 	    s22_0 = vec_sra(s22_0, two);
173 	    /* (pNN[i]+pNN[i+1]+pNNN[i]+pNNN[i+1]+2) >> 2 */
174 	    s22_1 = vec_sra(s22_1, two);
175 
176 	    /* s22_0 = 0x[   00,   02,   04,   06,   08,   0A,   0C,   0E] */
177 	    /* s22_1 = 0x[   20,   22,   24,   26,   28,   2A,   2C,   2E] */
178 	    /* s44_0 = 0x[00+20,02+22,04+24,06+26,08+28,0A+2A,0C+2C,0E+2E] */
179 	    s44_0 = vec_add(s22_0, s22_1);
180 
181 	    /* s44_0 = 0x[00+20+02+22, 04+24+06+26, 08+28+0A+2A, 0C+2C+0E+2E] */
182 	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));
183 
184 	    /* - - - - - - - - - - - - - - - - - - - */
185 	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
186 	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
187 	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
188 	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
189 	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
190 	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
191 	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
192 	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
193 
194 	    /* start loading next l[0-3] */
195 	    l0 = vec_ld(0, pB);
196 	    l1 = vec_ld(stride1, pB);
197 	    l2 = vec_ld(stride2, pB);
198 	    l3 = vec_ld(stride3, pB);
199 	    pB += 16;
200 
201 
202 	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
203 	    s22_3 = vec_packsu(vu32(s2), vu32(s3));
204 
205 	    s22_2 = vec_add(s22_2, two);
206 	    s22_3 = vec_add(s22_3, two);
207 
208 	    s22_2 = vec_sra(s22_2, two);
209 	    s22_3 = vec_sra(s22_3, two);
210 
211 
212 	    s44_1 = vec_add(s22_2, s22_3);
213 	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));
214 
215 	    /* store s22 block */
216 	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
217 	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
218 	    vec_st(vu8(s22_0), 0, pB2);
219 	    vec_st(vu8(s22_1), halfstride, pB2);
220 	    pB2 += 16;
221 
222 	    /* - - - - - - - - - - - - - - - - - - - */
223 	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
224 	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
225 	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
226 	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
227 	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
228 	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
229 	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
230 	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
231 
232 	    /* starting loading next l[0-3] */
233 	    l0 = vec_ld(0, pB);
234 	    l1 = vec_ld(stride1, pB);
235 	    l2 = vec_ld(stride2, pB);
236 	    l3 = vec_ld(stride3, pB);
237 	    pB += 16;
238 
239 
240 	    s22_0 = vec_packsu(vu32(s0), vu32(s1));
241 	    s22_1 = vec_packsu(vu32(s2), vu32(s3));
242 
243 	    s22_0 = vec_add(s22_0, two);
244 	    s22_1 = vec_add(s22_1, two);
245 
246 	    s22_0 = vec_sra(s22_0, two);
247 	    s22_1 = vec_sra(s22_1, two);
248 
249 
250 	    s44 = vec_packsu(vu32(s44_0), vu32(s44_1));
251 	    s44 = vec_add(s44, two);
252 	    s44 = vec_sra(s44, two);
253 
254 	    s44_0 = vec_add(s22_0, s22_1);
255 	    s44_0 = vu16(vec_sum4s(vs16(s44_0), vs32(zero)));
256 
257 	    /* - - - - - - - - - - - - - - - - - - - */
258 	    s0 = vu16(vec_mergeh(vu16(l0), vu16(l1)));
259 	    s0 = vu16(vec_sum4s(vu8(s0), vu32(zero)));
260 	    s1 = vu16(vec_mergel(vu16(l0), vu16(l1)));
261 	    s1 = vu16(vec_sum4s(vu8(s1), vu32(zero)));
262 	    s2 = vu16(vec_mergeh(vu16(l2), vu16(l3)));
263 	    s2 = vu16(vec_sum4s(vu8(s2), vu32(zero)));
264 	    s3 = vu16(vec_mergel(vu16(l2), vu16(l3)));
265 	    s3 = vu16(vec_sum4s(vu8(s3), vu32(zero)));
266 
267 	    s22_2 = vec_packsu(vu32(s0), vu32(s1));
268 	    s22_3 = vec_packsu(vu32(s2), vu32(s3));
269 
270 	    s22_2 = vec_add(s22_2, two);
271 	    s22_3 = vec_add(s22_3, two);
272 
273 	    s22_2 = vec_sra(s22_2, two);
274 	    s22_3 = vec_sra(s22_3, two);
275 
276 	    s44_1 = vec_add(s22_2, s22_3);
277 	    s44_1 = vu16(vec_sum4s(vs16(s44_1), vs32(zero)));
278 
279 	    /* store s22 block */
280 	    s22_0 = vu16(vec_packsu(s22_0, s22_2));
281 	    s22_1 = vu16(vec_packsu(s22_1, s22_3));
282 	    vec_st(vu8(s22_0), 0, pB2);
283 	    vec_st(vu8(s22_1), halfstride, pB2);
284 	    pB2 += 16;
285 
286 	    /* pack all four s44 chunks */
287 	    s44_0 = vec_packsu(vu32(s44_0), vu32(s44_1));
288 	    s44_0 = vec_add(s44_0, two);
289 	    s44_0 = vec_sra(s44_0, two);
290 	    s44 = vu16(vec_packsu(s44, s44_0));
291 
292 	    vec_st(vu8(s44), 0, pB4);
293 	    pB4 += 16;
294 
295 	} while (--i);
296 
297 	pB += stride3;
298 	pB2 += halfstride;
299 
300     } while (--j);
301 
302 #ifdef ALTIVEC_DST
303     vec_dss(0);
304 #endif
305 
306     AMBER_STOP;
307 }
308 
309 #if ALTIVEC_TEST_FUNCTION(subsample_image) /* {{{ */
310 #  ifdef ALTIVEC_VERIFY
311 
imgcpy(uint8_t * d,uint8_t * s,int width,int height,int stride)312 static void imgcpy(uint8_t *d, uint8_t *s, int width, int height, int stride)
313 {
314     int i, j;
315 
316     for (j = 0; j < height; j++) {
317 	for (i = 0; i < width; i++)
318 	    d[i] = s[i];
319 	d += width;
320 	s += stride;
321     }
322 }
323 
checksum(uint8_t * p,int width,int height,int stride)324 static unsigned long checksum(uint8_t *p, int width, int height, int stride)
325 {
326     int i, j;
327     unsigned long checksum;
328 
329     for (checksum = j = 0; j < height; j++) {
330 	for (i = 0; i < width; i++)
331 	    checksum += p[i];
332 	p += stride;
333     }
334 
335     return checksum;
336 }
337 
imgcmp(const char * ss,uint8_t * a,uint8_t * b,int width,int height,int stride)338 static void imgcmp(const char *ss, uint8_t *a, uint8_t *b,
339     int width, int height, int stride)
340 {
341     int i, j;
342 
343     for (j = 0; j < height; j++) {
344 	for (i = 0; i < width; i++)
345 	    if (a[i] != b[i])
346 		mjpeg_debug("subsample_image: %s[%d][%d] %d != %d",
347 		    ss, j, i, a[i], b[i]);
348 
349 	a += width;
350 	b += stride;
351     }
352 }
353 
subsample_image_altivec_verify(SUBSAMPLE_IMAGE_PDECL)354 void subsample_image_altivec_verify(SUBSAMPLE_IMAGE_PDECL)
355 {
356     int width, height;
357     unsigned long checksum44_1, checksum44_2;
358     unsigned long checksum22_1, checksum22_2;
359     unsigned char *cpy22, *cpy44;
360 
361     width = rowstride;
362     height = (unsigned long)(sub22_image - image) / rowstride;
363 
364     cpy22 = (unsigned char*)malloc((width/2) * (height/2));
365     cpy44 = (unsigned char*)malloc((width/4) * (height/4));
366     if (cpy22 == NULL || cpy44 == NULL)
367 	mjpeg_error_exit1("subsample_image: malloc failed");
368 
369     subsample_image_altivec(SUBSAMPLE_IMAGE_ARGS);
370     checksum22_1 = checksum(sub22_image, width/2, height/2, rowstride/2);
371     checksum44_1 = checksum(sub44_image, width/4, height/4, rowstride/4);
372 
373     /* copy data for imgcmp */
374     imgcpy(cpy22, sub22_image, width/2, height/2, rowstride/2);
375     imgcpy(cpy44, sub44_image, width/4, height/4, rowstride/4);
376 
377     ALTIVEC_TEST_WITH(subsample_image)(SUBSAMPLE_IMAGE_ARGS);
378     checksum22_2 = checksum(sub22_image, width/2, height/2, rowstride/2);
379     checksum44_2 = checksum(sub44_image, width/4, height/4, rowstride/4);
380 
381     if (checksum22_1 != checksum22_2 || checksum44_1 != checksum44_2) {
382 	mjpeg_debug("subsample_image(" SUBSAMPLE_IMAGE_PFMT ")",
383 	    SUBSAMPLE_IMAGE_ARGS);
384 	if (checksum22_1 != checksum22_2)
385 	    mjpeg_debug("subsample_image: %s checksums differ %d != %d",
386 		"2*2", checksum22_1, checksum22_2);
387 	if (checksum44_1 != checksum44_2)
388 	    mjpeg_debug("subsample_image: %s checksums differ %d != %d",
389 		"4*4", checksum44_1, checksum44_2);
390 
391 	imgcmp("2*2", cpy22, sub22_image, width/2, height/2, rowstride/2);
392 	imgcmp("4*4", cpy44, sub44_image, width/4, height/4, rowstride/4);
393     }
394 
395     free(cpy22);
396     free(cpy44);
397 }
398 
399 #  else
400 
401 #undef BENCHMARK_ITERATIONS
402 #define BENCHMARK_ITERATIONS 1000
403 #undef BENCHMARK_FREQUENCY  1
404 #define BENCHMARK_FREQUENCY  1
405 
406 ALTIVEC_TEST(subsample_image, void, (SUBSAMPLE_IMAGE_PDECL),
407     SUBSAMPLE_IMAGE_PFMT, SUBSAMPLE_IMAGE_ARGS);
408 #  endif
409 #endif /* }}} */
410 /* vim:set foldmethod=marker foldlevel=0: */
411