1 /* find_best_one_pel.c, this file is part of the
2 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
3 * Copyright (C) 2002 James Klicman <james@klicman.org>
4 *
5 * This library is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20 #ifdef HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23
24 #include <limits.h>
25
26 #include "altivec_motion.h"
27 #include "vectorize.h"
28 #include "../mjpeg_logging.h"
29
30 /* #define AMBER_ENABLE */
31 /* #define AMBER_MAX_TRACES 10 */
32 #include "amber.h"
33
34 #ifdef HAVE_ALTIVEC_H
35 /* include last to ensure AltiVec type semantics, especially for bool. */
36 #include <altivec.h>
37 #endif
38
39
/*
 * Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
 *
 * Input requirements:
 *   a) ref is always vector aligned
 *   b) rowstride is a multiple of 16
 *   c) h is either 8 or 16
 *
 */
/* Parameter declaration list used to spell the signature of every
 * find_best_one_pel variant in this file. {{{ */
#define FIND_BEST_ONE_PEL_PDECL     \
    me_result_set *sub22set,        \
    uint8_t *org,                   \
    uint8_t *ref,                   \
    int i0,                         \
    int j0,                         \
    int ihigh,                      \
    int jhigh,                      \
    int rowstride,                  \
    int h,                          \
    me_result_s *best_so_far
/* }}} */
57
/* Argument list naming the FIND_BEST_ONE_PEL_PDECL parameters in declaration
 * order, for forwarding a call to a function with the same signature. {{{ */
#define FIND_BEST_ONE_PEL_ARGS                      \
    sub22set, org, ref, i0, j0, ihigh, jhigh,       \
    rowstride, h, best_so_far
/* }}} */
63
64 /* void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL) {{{ */
65 #if defined(ALTIVEC_VERIFY) && ALTIVEC_TEST_FUNCTION(find_best_one_pel)
66 #define VERIFY_FIND_BEST_ONE_PEL
67 static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride,
68 int h, signed int *sads, int count);
69
70 static void _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL, int verify);
find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)71 void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)
72 {
73 _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_ARGS, 0 /* no verify */);
74 }
75
_find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL,int verify)76 static void _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL, int verify)
77 #else
78 void find_best_one_pel_altivec(FIND_BEST_ONE_PEL_PDECL)
79 #endif
80 /* }}} */
81 {
82 int i;
83 uint8_t *orgblk;
84 me_result_s *sub22mests;
85 int len;
86 uint8_t *pblk, *pref;
87 int x, y;
88 me_result_s mres;
89 vector unsigned char t0, t1, t2;
90 vector unsigned char l0, l1;
91 vector unsigned char perm0, perm1;
92 vector unsigned char blk1_0, blk1_1;
93 vector unsigned char vref;
94 vector unsigned int zero;
95 vector unsigned int sad00, sad10, sad01, sad11;
96 vector unsigned int sads;
97 vector unsigned int minsad;
98 vector bool int minsel;
99 vector signed char xy;
100 vector signed char xylim;
101 vector signed char minxy;
102 vector signed char xy11;
103 vector unsigned char xint,
104 yint;
105 union {
106 vector unsigned int _align16;
107 struct {
108 me_result_s xylim;
109 } init;
110 me_result_s xy;
111 me_result_s best;
112 } vio;
113 #ifdef ALTIVEC_DST
114 DataStreamControl dsc;
115 #endif
116 #ifdef VERIFY_FIND_BEST_ONE_PEL
117 vector signed int versads;
118 #endif
119
120 #ifdef ALTIVEC_VERIFY
121 if (NOT_VECTOR_ALIGNED(org))
122 mjpeg_error_exit1("find_best_one_pel: org %% 16 != 0, (0x%X)", org);
123
124 if (NOT_VECTOR_ALIGNED(ref))
125 mjpeg_error_exit1("find_best_one_pel: ref %% 16 != 0, (0x%X)", ref);
126
127 if (NOT_VECTOR_ALIGNED(rowstride))
128 mjpeg_error_exit1("find_best_one_pel: rowstride %% 16 != 0, (%d)",
129 rowstride);
130 #endif
131
132 if (h != 8 && h != 16)
133 mjpeg_error_exit1("find_best_one_pel: h != [8|16], (%d)", h);
134
135 AMBER_START;
136
137 len = sub22set->len;
138 if (len < 1) { /* sub22set->len is sometimes zero. */
139 best_so_far->weight = 255*255; /* we can save a lot of effort if we */
140 return; /* stop short. */
141 }
142
143 #ifdef ALTIVEC_DST
144 dsc.control = DATA_STREAM_CONTROL(1,0,0);
145 dsc.block.count = h;
146 dsc.block.stride = rowstride;
147 vec_dst(ref, dsc.control, 0);
148
149 /* increase size to 2 and increment count */
150 dsc.control += DATA_STREAM_CONTROL(1,1,0);
151 #endif
152
153 xy11 = (vector signed char)VCONST(0,0,0,0, 0,0,1,0, 0,0,0,1, 0,0,1,1);
154
155 mres.weight = 0; /* weight must be zero */
156 mres.x = ihigh - i0; /* x <= xylim.x */
157 mres.y = jhigh - j0; /* y <= xylim.y */
158 vio.init.xylim = mres;
159
160 yint = vec_lvsl(0, (unsigned char*)0);
161 xint = vu8(vec_splat_u32(0xf));
162 xint = vec_add(xint, yint /* lvsl */ );
163 yint = vu8(vec_splat_u32(1));
164 yint = vec_add(yint, xint);
165
166 /* initialize to zero */
167 zero = vec_splat_u32(0);
168
169 xylim = vec_ld(0, (signed char*) &vio.init.xylim);
170 xylim = vs8(vec_splat(vu32(xylim), 0));
171
172 minsad = vu32(vec_splat_s8(-1));
173
174 sub22mests = sub22set->mests;
175
176 do {
177 mres = *sub22mests;
178 x = mres.x;
179 y = mres.y;
180
181 orgblk = org + (i0 + x) + rowstride*(j0 + y);
182 #ifdef ALTIVEC_DST
183 vec_dst(orgblk, dsc.control, 1);
184 #endif
185
186 mres.weight = 0; /* weight must be zero */
187 vio.xy = mres;
188 sub22mests++;
189
190
191 /* orgblk alignment should always be a multiple of 2 {0,2,4,6,8,A,C,E}
192 * this is important to avoid the edge case where (orgblk&15)==15
193 */
194 if (((unsigned int)orgblk & 1) != 0)
195 mjpeg_warn("find_best_one_pel: orgblk %% 2 != 0 (0x%X)", orgblk);
196
197 /* calculate SAD for macroblocks:
198 * orgblk(0, 0), orgblk(+1, 0),
199 * orgblk(0,+1), orgblk(+1,+1)
200 */
201
202 /* initialize to sad vectors to zero {{{ */
203 sad00 = vec_splat_u32(0);
204 sad10 = vec_splat_u32(0);
205 sad01 = vec_splat_u32(0);
206 sad11 = vec_splat_u32(0);
207 /* }}} */
208
209 pblk = orgblk; /* always aligned by 2 {0,2,4,6,8,A,C,E} */
210 l0 = vec_ld(0, pblk);
211 l1 = vec_ld(16, pblk);
212
213 pref = ref;
214 vref = vec_ld(0, pref);
215
216 perm0 = vec_lvsl(0, pblk);
217 perm1 = vec_splat_u8(1);
218 perm1 = vec_add(perm0, perm1);
219
220 blk1_0 = vec_perm(l0, l1, perm0);
221 blk1_1 = vec_perm(l0, l1, perm1);
222
223 i = h - 1;
224 do {
225 /* start loading next */
226 pblk += rowstride;
227 l0 = vec_ld(0, pblk);
228 l1 = vec_ld(16, pblk);
229
230 t0 = vec_max(blk1_0, vref);
231 t1 = vec_min(blk1_0, vref);
232 t2 = vec_sub(t0, t1);
233 sad00 = vec_sum4s(t2, sad00);
234
235 t0 = vec_max(blk1_1, vref);
236 t1 = vec_min(blk1_1, vref);
237 t2 = vec_sub(t0, t1);
238 sad10 = vec_sum4s(t2, sad10);
239
240
241 blk1_0 = vec_perm(l0, l1, perm0);
242 blk1_1 = vec_perm(l0, l1, perm1);
243
244
245 t0 = vec_max(blk1_0, vref);
246 t1 = vec_min(blk1_0, vref);
247 t2 = vec_sub(t0, t1);
248 sad01 = vec_sum4s(t2, sad01);
249
250 t0 = vec_max(blk1_1, vref);
251 t1 = vec_min(blk1_1, vref);
252
253 pref += rowstride;
254 vref = vec_ld(0, pref);
255
256 t2 = vec_sub(t0, t1);
257 sad11 = vec_sum4s(t2, sad11);
258 } while (--i);
259
260 /* start loading last */
261 pblk += rowstride;
262 l0 = vec_ld(0, pblk);
263 l1 = vec_ld(16, pblk);
264
265 t0 = vec_max(blk1_0, vref);
266 t1 = vec_min(blk1_0, vref);
267 t2 = vec_sub(t0, t1);
268 sad00 = vec_sum4s(t2, sad00);
269
270 t0 = vec_max(blk1_1, vref);
271 t1 = vec_min(blk1_1, vref);
272 t2 = vec_sub(t0, t1);
273 sad10 = vec_sum4s(t2, sad10);
274
275 blk1_0 = vec_perm(l0, l1, perm0);
276 blk1_1 = vec_perm(l0, l1, perm1);
277
278 t0 = vec_max(blk1_0, vref);
279 t1 = vec_min(blk1_0, vref);
280 t2 = vec_sub(t0, t1);
281 sad01 = vec_sum4s(t2, sad01);
282
283 t0 = vec_max(blk1_1, vref);
284 t1 = vec_min(blk1_1, vref);
285 t2 = vec_sub(t0, t1);
286 sad11 = vec_sum4s(t2, sad11);
287
288
289 /* calculate final sums {{{ */
290 sad00 = vu32(vec_sums(vs32(sad00), vs32(zero)));
291 sad10 = vu32(vec_sums(vs32(sad10), vs32(zero)));
292 sad01 = vu32(vec_sums(vs32(sad01), vs32(zero)));
293 sad11 = vu32(vec_sums(vs32(sad11), vs32(zero)));
294 /* }}} */
295
296 /* sads = {sad00, sad10, sad01, sad11} {{{ */
297 sad00 = vu32(vec_mergel(vu32(sad00), vu32(sad01)));
298 sad10 = vu32(vec_mergel(vu32(sad10), vu32(sad11)));
299 sads = vu32(vec_mergel(vu32(sad00), vu32(sad10)));
300 /* }}} */
301
302 #ifdef VERIFY_FIND_BEST_ONE_PEL /* {{{ */
303 if (verify) {
304 vec_st(sads, 0, (unsigned int*)&versads);
305 verify_sads(orgblk, ref, rowstride, h, (signed int*)&versads, 4);
306 }
307 #endif /* }}} */
308
309 /* add penalty, clip xy, arrange into me_result_s ... {{{ */
310 {
311 xy = vec_ld(0, (signed char*) &vio.xy);
312 xy = vs8(vec_splat(vu32(xy), 0)); /* splat vio.xy */
313
314 /* add distance penalty {{{ */
315 /* penalty = (abs(x) + abs(y)) << 3 */
316 {
317 vector signed char xyabs;
318 vector unsigned int xxxx, yyyy;
319 vector unsigned int penalty;
320
321 /* (abs(x),abs(y)) */
322 xyabs = vec_subs(vs8(zero), xy);
323 xyabs = vec_max(xyabs, xy);
324
325 /* xxxx = (x, x, x, x), yyyy = (y, y, y, y)
326 * (0,0,x,y, 0,0,x,y, 0,0,x,y, 0,0,x,y) |/- permute vector -\|
327 * (0,0,0,x, 0,0,0,x, 0,0,0,x, 0,0,0,x) |lvsl+(0x0000000F,...)|
328 * (0,0,0,y, 0,0,0,y, 0,0,0,y, 0,0,0,y) |lvsl+(0x00000010,...)|
329 */
330 xxxx = vu32(vec_perm(vs8(zero), xyabs, xint));
331 yyyy = vu32(vec_perm(vs8(zero), xyabs, yint));
332
333 /* penalty = (abs(x) + abs(y)) << 3 */
334 xxxx = vec_add(xxxx, yyyy);
335 penalty = vec_splat_u32(3);
336 penalty = vec_sl(xxxx, penalty /* (3,...) */ );
337
338 sads = vec_add(sads, penalty);
339 } /* }}} */
340
341
342 /* original version adds same penalty for each sad
343 * so xy adjustment must be after penalty calc.
344 */
345 xy = vec_add(xy, xy11); /* adjust xy values for elements 1-3 */
346
347 /* mask sads x <= (ihigh - i0) && y <= (jhigh - j0) {{{ */
348 /* the first cmpgt (s8) will flag any x and/or y coordinates... {{{
349 * as out of bounds. the second cmpgt (u32) will complete the
350 * mask if the x or y flag for that result is set.
351 *
352 * Example: {{{
353 * X Y X Y X Y X Y
354 * [0 0 < <] [0 0 < <] [0 0 > <] [0 0 < >]
355 * vb8(xymask) = vec_cmpgt(vu8(xy), xylim)
356 * [0 0 0 0] [0 0 0 0] [0 0 1 0] [0 0 0 1]
357 * vb32(xymask) = vec_cmpgt(vu32(xymask), vu32(zero))
358 * [0 0 0 0] [0 0 0 0] [1 1 1 1] [1 1 1 1]
359 *
360 * Legend: 0=0x00 (<)=(xy[n] <= xymax[n])
361 * 1=0xff (>)=(xy[n] > xymax[n])
362 * }}}
363 */ /* }}} */
364 {
365 vector bool int xymask;
366
367 xymask = vb32(vec_cmpgt(xy, xylim));
368 xymask = vec_cmpgt(vu32(xymask), zero);
369
370 /* 'or' xymask to sads thereby forcing
371 * masked values above the threshold.
372 */
373 sads = vec_or(sads, vu32(xymask));
374 } /* }}} */
375 } /* }}} */
376
377 /* find sads lower than minsad */
378 minsel = vec_cmplt(sads, minsad);
379
380 minsad = vec_sel(minsad, sads, minsel);
381 minxy = vec_sel(minxy, xy, vb8(minsel));
382
383 #define minsad32 vu32(t0)
384 #define minxy32 vs8(t1)
385 t0 = vu8(vec_sld(vu32(zero), vu32(minsad), 12));
386 t1 = vu8(vec_sld(vu32(zero), vu32(minxy), 12));
387
388 minsel = vec_cmplt(minsad, minsad32);
389 minsad = vec_sel(minsad32, minsad, minsel);
390 minxy = vec_sel(minxy32, minxy, vb8(minsel));
391 #undef minsad32 /* t0 */
392 #undef minxy32 /* t1 */
393
394 #define minsad64 vu32(t0)
395 #define minxy64 vs8(t1)
396 t0 = vu8(vec_sld(vu32(zero), vu32(minsad), 8));
397 t1 = vu8(vec_sld(vu32(zero), vu32(minxy), 8));
398
399 minsel = vec_cmplt(minsad, minsad64);
400 minsad = vec_sel(minsad64, minsad, minsel);
401 minxy = vec_sel(minxy64, minxy, vb8(minsel));
402 #undef minsad64 /* t0 */
403 #undef minxy64 /* t1 */
404
405 minsad = vec_splat(minsad, 3);
406 minxy = vs8(vec_splat(vu32(minxy), 3));
407 /* }}} */
408 } while (--len);
409
410
411 /* arrange sad and xy into me_result_s form {{{ */
412 /* ( 0, sad, 0, sad, 0, sad, 0, sad )
413 * ( sad, sad, sad, sad, sad, sad, sad, sad )
414 *
415 * ( 0, xy, 0, xy, 0, xy, 0, xy )
416 * ( xy, xy, xy, xy, xy, xy, xy, xy )
417 *
418 * ( sad, xy, sad, xy, sad, xy, sad, xy )
419 */
420 minsad = vu32(vec_pack(vu32(minsad), vu32(minsad)));
421 minxy = vs8(vec_pack(vu32(minxy), vu32(minxy)));
422 minsad = vu32(vec_mergeh(vu16(minsad), vu16(minxy)));
423 /* }}} */
424
425 /* store mests to vo for scalar access */
426 vec_st(minsad, 0, (unsigned int*) &vio.best);
427
428 mres = vio.best;
429 if (mres.weight > 255*255)
430 mres.weight = 255*255;
431
432 *best_so_far = mres;
433
434 AMBER_STOP;
435
436 #undef sads
437 }
438
439
440 #if ALTIVEC_TEST_FUNCTION(find_best_one_pel) /* {{{ */
441
/* printf-style format for logging the FIND_BEST_ONE_PEL_ARGS values, in the
 * same order.  The four pointer arguments use %p; formatting a pointer with
 * %X is a format/argument type mismatch.
 */
#define FIND_BEST_ONE_PEL_PFMT                                          \
    "sub22set=%p, org=%p, blk=%p, i0=%d, j0=%d, ihigh=%d, jhigh=%d, "   \
    "rowstride=%d, h=%d, best_so_far=%p"
445
446 # ifdef ALTIVEC_VERIFY
find_best_one_pel_altivec_verify(FIND_BEST_ONE_PEL_PDECL)447 void find_best_one_pel_altivec_verify(FIND_BEST_ONE_PEL_PDECL)
448 {
449 me_result_s best, best1, best2;
450
451 best = *best_so_far; /* save best */
452 _find_best_one_pel_altivec(FIND_BEST_ONE_PEL_ARGS, 1 /* verify */);
453 best1 = *best_so_far;
454
455 *best_so_far = best; /* restore best */
456 ALTIVEC_TEST_WITH(find_best_one_pel)(FIND_BEST_ONE_PEL_ARGS);
457 best2 = *best_so_far;
458
459 if (best1.weight != best2.weight ||
460 best1.x != best2.x ||
461 best1.y != best2.y)
462 {
463 mjpeg_debug("find_best_one_pel(" FIND_BEST_ONE_PEL_PFMT ")",
464 FIND_BEST_ONE_PEL_ARGS);
465 mjpeg_debug("find_best_one_pel: sub22set->len=%d", sub22set->len);
466 mjpeg_debug("find_best_one_pel: best_so_far "
467 "{weight=%d,x=%d,y=%d} != {weight=%d,x=%d,y=%d}",
468 best1.weight, best1.x, best1.y,
469 best2.weight, best2.x, best2.y);
470 }
471 }
472
verify_sads(uint8_t * blk1,uint8_t * blk2,int stride,int h,signed int * sads,int count)473 static void verify_sads(uint8_t *blk1, uint8_t *blk2, int stride,
474 int h, signed int *sads, int count)
475 {
476 int i, d, d2, dmin;
477 uint8_t *pblk;
478
479 pblk = blk1;
480 dmin = INT_MAX;
481
482 for (i = 0; i < count; i++) {
483 /* d = sad_00(blk1, blk2, stride, h, dmin); {{{ */
484 #if ALTIVEC_TEST_FUNCTION(sad_00)
485 d = ALTIVEC_TEST_WITH(sad_00)(pblk, blk2, stride, h, dmin);
486 #else
487 d = sad_00_altivec(pblk, blk2, stride, h, dmin);
488 #endif /* }}} */
489 d2 = sads[i];
490 if (d != d2 && d2 <= dmin) {
491 mjpeg_debug("find_best_one_pel: %d[%d] != %d=sad_00"
492 "(blk1=0x%X(0x%X), blk2=0x%X, stride=%d, h=%d, dmin=%d)",
493 d2, i, d, pblk, blk1, blk2, stride, h, dmin);
494 }
495
496 if (i == 1)
497 pblk += stride-1;
498 else
499 pblk += 1;
500 }
501 }
502
503 # else
504 #undef BENCHMARK_FREQUENCY
505 #define BENCHMARK_FREQUENCY 543
506
507 #undef BENCHMARK_EPILOG
508 #define BENCHMARK_EPILOG \
509 mjpeg_info("find_best_one_pel: sub22set->len=%d", sub22set->len);
510
511 ALTIVEC_TEST(find_best_one_pel, void, (FIND_BEST_ONE_PEL_PDECL),
512 FIND_BEST_ONE_PEL_PFMT, FIND_BEST_ONE_PEL_ARGS);
513 # endif
514 #endif /* }}} */
515 /* vim:set sw=4 softtabstop=4 foldmethod=marker foldlevel=0: */
516