/* find_best_one_pel.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
19 
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <limits.h>
#include <stdint.h>

#include "altivec_motion.h"
#include "vectorize.h"
#include "../mjpeg_logging.h"

/* #define AMBER_ENABLE */
/* #define AMBER_MAX_TRACES 10 */
#include "amber.h"

#ifdef HAVE_ALTIVEC_H
/* include last to ensure AltiVec type semantics, especially for bool. */
#include <altivec.h>
#endif
38 
39 
/*
 * Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
 *
 * Input requirements:
 *   a) ref is always vector aligned
 *   b) rowstride is a multiple of 16
 *   c) h is either 8 or 16
 *
 */
/*
 * Parameter declaration list shared by every find_best_one_pel variant
 * defined in this file.  Keeping it in one macro guarantees that the
 * public entry point, the internal worker and the verify/benchmark
 * wrappers all agree on the signature.
 */
#define FIND_BEST_ONE_PEL_PDECL /* {{{ */                                    \
  me_result_set *sub22set,                                                   \
  uint8_t *org, uint8_t *ref,                                                \
  int i0, int j0,                                                            \
  int ihigh, int jhigh,                                                      \
  int rowstride, int h,                                                      \
  me_result_s *best_so_far                                                   \
  /* }}} */

/*
 * Matching argument list: forwards the parameters declared by
 * FIND_BEST_ONE_PEL_PDECL, in the same order, to another variant.
 */
#define FIND_BEST_ONE_PEL_ARGS /* {{{ */                                     \
  sub22set, org, ref,                                                        \
  i0, j0, ihigh, jhigh,                                                      \
  rowstride, h, best_so_far                                                  \
  /* }}} */
63 
64 /* void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL) {{{ */
65 #if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(find_best_one_pel)
66 #define VERIFY_FIND_BEST_ONE_PEL
67 static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride,
68 			int h, signed int *sads, int count);
69 
70 static void _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL, int verify);
find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)71 void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)
72 {
73   _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_ARGS, 0 /* no verify */);
74 }
75 
_find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL,int verify)76 static void _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL, int verify)
77 #else
78 void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)
79 #endif
80 /* }}} */
81 {
82     int i;
83     uint8_t *orgblk;
84     me_result_s *sub22mests;
85     int len;
86     uint8_t *pblk, *pref;
87     int x, y;
88     me_result_s mres;
89     vector unsigned char t0, t1, t2;
90     vector unsigned char l0, l1;
91     vector unsigned char perm0, perm1;
92     vector unsigned char blk1_0, blk1_1;
93     vector unsigned char vref;
94     vector unsigned int zero;
95     vector unsigned int sad00, sad10, sad01, sad11;
96     vector unsigned int sads;
97     vector unsigned int minsad;
98     vector bool int minsel;
99     vector signed char xy;
100     vector signed char xylim;
101     vector signed char minxy;
102     vector signed char xy11;
103     vector unsigned char xint,
104 			 yint;
105     union {
106 	vector unsigned int _align16;
107 	struct {
108 	    me_result_s xylim;
109 	} init;
110 	me_result_s xy;
111 	me_result_s best;
112     } vio;
113 #ifdef ALTIVEC_DST
114     DataStreamControl dsc;
115 #endif
116 #ifdef VERIFY_FIND_BEST_ONE_PEL
117     vector signed int versads;
118 #endif
119 
120 #ifdef ALTIVEC_VERIFY
121   if (NOT_VECTOR_ALIGNED(org))
122     mjpeg_error_exit1("find_best_one_pel: org %% 16 != 0, (0x%X)", org);
123 
124   if (NOT_VECTOR_ALIGNED(ref))
125     mjpeg_error_exit1("find_best_one_pel: ref %% 16 != 0, (0x%X)", ref);
126 
127   if (NOT_VECTOR_ALIGNED(rowstride))
128     mjpeg_error_exit1("find_best_one_pel: rowstride %% 16 != 0, (%d)",
129       rowstride);
130 #endif
131 
132   if (h != 8 && h != 16)
133     mjpeg_error_exit1("find_best_one_pel: h != [8|16], (%d)", h);
134 
135     AMBER_START;
136 
137     len = sub22set->len;
138     if (len < 1) {			/* sub22set->len is sometimes zero.  */
139 	best_so_far->weight = 255*255;	/* we can save a lot of effort if we */
140 	return;				/* stop short.                       */
141     }
142 
143 #ifdef ALTIVEC_DST
144     dsc.control = DATA_STREAM_CONTROL(1,0,0);
145     dsc.block.count = h;
146     dsc.block.stride = rowstride;
147     vec_dst(ref, dsc.control, 0);
148 
149     /* increase size to 2 and increment count */
150     dsc.control += DATA_STREAM_CONTROL(1,1,0);
151 #endif
152 
153     xy11 = (vector signed char)VCONST(0,0,0,0, 0,0,1,0, 0,0,0,1, 0,0,1,1);
154 
155     mres.weight = 0;		/* weight must be zero */
156     mres.x = ihigh - i0;	/* x <= xylim.x */
157     mres.y = jhigh - j0;	/* y <= xylim.y */
158     vio.init.xylim = mres;
159 
160     yint = vec_lvsl(0, (unsigned char*)0);
161     xint = vu8(vec_splat_u32(0xf));
162     xint = vec_add(xint, yint /* lvsl */ );
163     yint = vu8(vec_splat_u32(1));
164     yint = vec_add(yint, xint);
165 
166     /* initialize to zero */
167     zero = vec_splat_u32(0);
168 
169     xylim = vec_ld(0, (signed char*) &vio.init.xylim);
170     xylim = vs8(vec_splat(vu32(xylim), 0));
171 
172     minsad = vu32(vec_splat_s8(-1));
173 
174     sub22mests = sub22set->mests;
175 
176     do {
177 	mres = *sub22mests;
178 	x = mres.x;
179 	y = mres.y;
180 
181 	orgblk = org + (i0 + x) + rowstride*(j0 + y);
182 #ifdef ALTIVEC_DST
183 	vec_dst(orgblk, dsc.control, 1);
184 #endif
185 
186 	mres.weight = 0; /* weight must be zero */
187 	vio.xy = mres;
188 	sub22mests++;
189 
190 
191 	/* orgblk alignment should always be a multiple of 2 {0,2,4,6,8,A,C,E}
192 	 * this is important to avoid the edge case where (orgblk&15)==15
193 	 */
194 	if (((unsigned int)orgblk & 1) != 0)
195 	    mjpeg_warn("find_best_one_pel: orgblk %% 2 != 0 (0x%X)", orgblk);
196 
197 	/* calculate SAD for macroblocks:
198 	 * orgblk(0, 0), orgblk(+1, 0),
199 	 * orgblk(0,+1), orgblk(+1,+1)
200 	 */
201 
202 	/* initialize to sad vectors to zero {{{ */
203 	sad00 = vec_splat_u32(0);
204 	sad10 = vec_splat_u32(0);
205 	sad01 = vec_splat_u32(0);
206 	sad11 = vec_splat_u32(0);
207 	/* }}} */
208 
209 	pblk = orgblk; /* always aligned by 2 {0,2,4,6,8,A,C,E} */
210 	l0 = vec_ld(0, pblk);
211 	l1 = vec_ld(16, pblk);
212 
213 	pref = ref;
214 	vref = vec_ld(0, pref);
215 
216 	perm0 = vec_lvsl(0, pblk);
217 	perm1 = vec_splat_u8(1);
218 	perm1 = vec_add(perm0, perm1);
219 
220 	blk1_0 = vec_perm(l0, l1, perm0);
221 	blk1_1 = vec_perm(l0, l1, perm1);
222 
223 	i = h - 1;
224 	do {
225 	    /* start loading next */
226 	    pblk += rowstride;
227 	    l0 = vec_ld(0, pblk);
228 	    l1 = vec_ld(16, pblk);
229 
230 	    t0 = vec_max(blk1_0, vref);
231 	    t1 = vec_min(blk1_0, vref);
232 	    t2 = vec_sub(t0, t1);
233 	    sad00  = vec_sum4s(t2, sad00);
234 
235 	    t0 = vec_max(blk1_1, vref);
236 	    t1 = vec_min(blk1_1, vref);
237 	    t2 = vec_sub(t0, t1);
238 	    sad10  = vec_sum4s(t2, sad10);
239 
240 
241 	    blk1_0 = vec_perm(l0, l1, perm0);
242 	    blk1_1 = vec_perm(l0, l1, perm1);
243 
244 
245 	    t0 = vec_max(blk1_0, vref);
246 	    t1 = vec_min(blk1_0, vref);
247 	    t2 = vec_sub(t0, t1);
248 	    sad01  = vec_sum4s(t2, sad01);
249 
250 	    t0 = vec_max(blk1_1, vref);
251 	    t1 = vec_min(blk1_1, vref);
252 
253 	    pref += rowstride;
254 	    vref = vec_ld(0, pref);
255 
256 	    t2 = vec_sub(t0, t1);
257 	    sad11  = vec_sum4s(t2, sad11);
258 	} while (--i);
259 
260 	/* start loading last */
261 	pblk += rowstride;
262 	l0 = vec_ld(0, pblk);
263 	l1 = vec_ld(16, pblk);
264 
265 	t0 = vec_max(blk1_0, vref);
266 	t1 = vec_min(blk1_0, vref);
267 	t2 = vec_sub(t0, t1);
268 	sad00  = vec_sum4s(t2, sad00);
269 
270 	t0 = vec_max(blk1_1, vref);
271 	t1 = vec_min(blk1_1, vref);
272 	t2 = vec_sub(t0, t1);
273 	sad10  = vec_sum4s(t2, sad10);
274 
275 	blk1_0 = vec_perm(l0, l1, perm0);
276 	blk1_1 = vec_perm(l0, l1, perm1);
277 
278 	t0 = vec_max(blk1_0, vref);
279 	t1 = vec_min(blk1_0, vref);
280 	t2 = vec_sub(t0, t1);
281 	sad01  = vec_sum4s(t2, sad01);
282 
283 	t0 = vec_max(blk1_1, vref);
284 	t1 = vec_min(blk1_1, vref);
285 	t2 = vec_sub(t0, t1);
286 	sad11  = vec_sum4s(t2, sad11);
287 
288 
289 	/* calculate final sums {{{ */
290 	sad00 = vu32(vec_sums(vs32(sad00), vs32(zero)));
291 	sad10 = vu32(vec_sums(vs32(sad10), vs32(zero)));
292 	sad01 = vu32(vec_sums(vs32(sad01), vs32(zero)));
293 	sad11 = vu32(vec_sums(vs32(sad11), vs32(zero)));
294 	/* }}} */
295 
296 	/* sads = {sad00, sad10, sad01, sad11} {{{ */
297 	sad00 = vu32(vec_mergel(vu32(sad00), vu32(sad01)));
298 	sad10 = vu32(vec_mergel(vu32(sad10), vu32(sad11)));
299 	sads = vu32(vec_mergel(vu32(sad00), vu32(sad10)));
300 	/* }}} */
301 
302 #ifdef VERIFY_FIND_BEST_ONE_PEL /* {{{ */
303 	if (verify) {
304 	    vec_st(sads, 0, (unsigned int*)&versads);
305 	    verify_sads(orgblk, ref, rowstride, h, (signed int*)&versads, 4);
306 	}
307 #endif /* }}} */
308 
309 	/* add penalty, clip xy, arrange into me_result_s ... {{{ */
310 	{
311 	    xy = vec_ld(0, (signed char*) &vio.xy);
312 	    xy = vs8(vec_splat(vu32(xy), 0)); /* splat vio.xy */
313 
314 	    /* add distance penalty {{{ */
315 	    /* penalty = (abs(x) + abs(y)) << 3 */
316 	    {
317 		vector signed char  xyabs;
318 		vector unsigned int xxxx, yyyy;
319 		vector unsigned int penalty;
320 
321 		/* (abs(x),abs(y)) */
322 		xyabs = vec_subs(vs8(zero), xy);
323 		xyabs = vec_max(xyabs, xy);
324 
325 		/* xxxx = (x, x, x, x), yyyy = (y, y, y, y)
326 		 * (0,0,x,y, 0,0,x,y, 0,0,x,y, 0,0,x,y) |/- permute vector  -\|
327 		 * (0,0,0,x, 0,0,0,x, 0,0,0,x, 0,0,0,x) |lvsl+(0x0000000F,...)|
328 		 * (0,0,0,y, 0,0,0,y, 0,0,0,y, 0,0,0,y) |lvsl+(0x00000010,...)|
329 		 */
330 		xxxx = vu32(vec_perm(vs8(zero), xyabs, xint));
331 		yyyy = vu32(vec_perm(vs8(zero), xyabs, yint));
332 
333 		/* penalty = (abs(x) + abs(y)) << 3 */
334 		xxxx = vec_add(xxxx, yyyy);
335 		penalty = vec_splat_u32(3);
336 		penalty = vec_sl(xxxx, penalty /* (3,...) */ );
337 
338 		sads = vec_add(sads, penalty);
339 	    } /* }}} */
340 
341 
342 	    /* original version adds same penalty for each sad
343 	     * so xy adjustment must be after penalty calc.
344 	     */
345 	    xy = vec_add(xy, xy11); /* adjust xy values for elements 1-3 */
346 
347 	    /* mask sads  x <= (ihigh - i0) && y <= (jhigh - j0) {{{ */
348 	    /* the first cmpgt (s8) will flag any x and/or y coordinates... {{{
349 	     * as out of bounds. the second cmpgt (u32) will complete the
350 	     * mask if the x or y flag for that result is set.
351 	     *
352 	     * Example: {{{
353 	     *        X  Y         X  Y         X  Y         X  Y
354 	     * [0  0  <  <] [0  0  <  <] [0  0  >  <] [0  0  <  >]
355 	     * vb8(xymask)  = vec_cmpgt(vu8(xy), xylim)
356 	     * [0  0  0  0] [0  0  0  0] [0  0  1  0] [0  0  0  1]
357 	     * vb32(xymask) = vec_cmpgt(vu32(xymask), vu32(zero))
358 	     * [0  0  0  0] [0  0  0  0] [1  1  1  1] [1  1  1  1]
359 	     *
360 	     * Legend: 0=0x00  (<)=(xy[n] <= xymax[n])
361 	     *         1=0xff  (>)=(xy[n] >  xymax[n])
362 	     * }}}
363 	     */ /* }}} */
364 	    {
365 		vector bool int xymask;
366 
367 		xymask = vb32(vec_cmpgt(xy, xylim));
368 		xymask = vec_cmpgt(vu32(xymask), zero);
369 
370 		/* 'or' xymask to sads thereby forcing
371 		 * masked values above the threshold.
372 		 */
373 		sads = vec_or(sads, vu32(xymask));
374 	    } /* }}} */
375 	} /* }}} */
376 
377 	/* find sads lower than minsad */
378 	minsel = vec_cmplt(sads, minsad);
379 
380 	minsad = vec_sel(minsad, sads, minsel);
381 	minxy = vec_sel(minxy, xy, vb8(minsel));
382 
383 #define minsad32 vu32(t0)
384 #define minxy32  vs8(t1)
385 	t0 = vu8(vec_sld(vu32(zero), vu32(minsad), 12));
386 	t1 = vu8(vec_sld(vu32(zero), vu32(minxy), 12));
387 
388 	minsel = vec_cmplt(minsad, minsad32);
389 	minsad = vec_sel(minsad32, minsad, minsel);
390 	minxy = vec_sel(minxy32, minxy, vb8(minsel));
391 #undef minsad32 /* t0 */
392 #undef minxy32  /* t1 */
393 
394 #define minsad64 vu32(t0)
395 #define minxy64  vs8(t1)
396 	t0 = vu8(vec_sld(vu32(zero), vu32(minsad), 8));
397 	t1 = vu8(vec_sld(vu32(zero), vu32(minxy), 8));
398 
399 	minsel = vec_cmplt(minsad, minsad64);
400 	minsad = vec_sel(minsad64, minsad, minsel);
401 	minxy = vec_sel(minxy64, minxy, vb8(minsel));
402 #undef minsad64 /* t0 */
403 #undef minxy64  /* t1 */
404 
405 	minsad = vec_splat(minsad, 3);
406 	minxy = vs8(vec_splat(vu32(minxy), 3));
407 	/* }}} */
408     } while (--len);
409 
410 
411     /* arrange sad and xy into me_result_s form {{{ */
412     /* (   0, sad,   0, sad,   0, sad,   0, sad )
413      * ( sad, sad, sad, sad, sad, sad, sad, sad )
414      *
415      * (   0,  xy,   0,  xy,   0,  xy,   0,  xy )
416      * (  xy,  xy,  xy,  xy,  xy,  xy,  xy,  xy )
417      *
418      * ( sad,  xy, sad,  xy, sad,  xy, sad,  xy )
419      */
420     minsad = vu32(vec_pack(vu32(minsad), vu32(minsad)));
421     minxy = vs8(vec_pack(vu32(minxy), vu32(minxy)));
422     minsad = vu32(vec_mergeh(vu16(minsad), vu16(minxy)));
423     /* }}} */
424 
425     /* store mests to vo for scalar access */
426     vec_st(minsad, 0, (unsigned int*) &vio.best);
427 
428     mres = vio.best;
429     if (mres.weight > 255*255)
430 	mres.weight = 255*255;
431 
432     *best_so_far = mres;
433 
434   AMBER_STOP;
435 
436 #undef sads
437 }
438 
439 
440 #if ALTIVEC_TEST_FUNCTION(find_best_one_pel) /* {{{ */
441 
442 #define FIND_BEST_ONE_PEL_PFMT                                               \
443   "sub22set=0x%X, org=0x%X, blk=0x%X, i0=%d, j0=%d, ihigh=%d, jhigh=%d, "    \
444   "rowstride=%d, h=%d, best_so_far=0x%X"
445 
446 #  ifdef ALTIVEC_VERIFY
find_best_one_pel_altivec_verify(FIND_BEST_ONE_PEL_PDECL)447 void find_best_one_pel_altivec_verify(FIND_BEST_ONE_PEL_PDECL)
448 {
449   me_result_s best, best1, best2;
450 
451   best = *best_so_far; /* save best */
452   _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_ARGS, 1 /* verify */);
453   best1 = *best_so_far;
454 
455   *best_so_far = best; /* restore best */
456   ALTIVEC_TEST_WITH(find_best_one_pel)(FIND_BEST_ONE_PEL_ARGS);
457   best2 = *best_so_far;
458 
459   if (best1.weight != best2.weight ||
460       best1.x != best2.x ||
461       best1.y != best2.y)
462   {
463     mjpeg_debug("find_best_one_pel(" FIND_BEST_ONE_PEL_PFMT ")",
464 		FIND_BEST_ONE_PEL_ARGS);
465     mjpeg_debug("find_best_one_pel: sub22set->len=%d", sub22set->len);
466     mjpeg_debug("find_best_one_pel: best_so_far "
467 		"{weight=%d,x=%d,y=%d} != {weight=%d,x=%d,y=%d}",
468 		best1.weight, best1.x, best1.y,
469 		best2.weight, best2.x, best2.y);
470   }
471 }
472 
verify_sads(uint8_t * blk1,uint8_t * blk2,int stride,int h,signed int * sads,int count)473 static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride,
474 			int h, signed int *sads, int count)
475 {
476   int i, d, d2, dmin;
477   uint8_t *pblk;
478 
479   pblk = blk1;
480   dmin = INT_MAX;
481 
482   for (i = 0; i < count; i++) {
483     /* d = sad_00(blk1, blk2, stride, h, dmin); {{{ */
484 #if ALTIVEC_TEST_FUNCTION(sad_00)
485     d = ALTIVEC_TEST_WITH(sad_00)(pblk, blk2, stride, h, dmin);
486 #else
487     d = sad_00_altivec(pblk, blk2, stride, h, dmin);
488 #endif /* }}} */
489     d2 = sads[i];
490     if (d != d2 && d2 <= dmin) {
491       mjpeg_debug("find_best_one_pel: %d[%d] != %d=sad_00"
492 	"(blk1=0x%X(0x%X), blk2=0x%X, stride=%d, h=%d, dmin=%d)",
493 	d2, i, d, pblk, blk1, blk2, stride, h, dmin);
494     }
495 
496     if (i == 1)
497       pblk += stride-1;
498     else
499       pblk += 1;
500   }
501 }
502 
503 #  else
504 #undef BENCHMARK_FREQUENCY
505 #define BENCHMARK_FREQUENCY 543
506 
507 #undef BENCHMARK_EPILOG
508 #define BENCHMARK_EPILOG                                                     \
509   mjpeg_info("find_best_one_pel: sub22set->len=%d", sub22set->len);
510 
511 ALTIVEC_TEST(find_best_one_pel, void, (FIND_BEST_ONE_PEL_PDECL),
512     FIND_BEST_ONE_PEL_PFMT, FIND_BEST_ONE_PEL_ARGS);
513 #  endif
514 #endif /* }}} */
/* vim:set sw=4 softtabstop=4 foldmethod=marker foldlevel=0: */