/* bsumsq.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002 James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "altivec_motion.h"
#include "vectorize.h"
#include "../mjpeg_logging.h"

/* #define AMBER_ENABLE */
#include "amber.h"

#ifdef HAVE_ALTIVEC_H
/* include last to ensure AltiVec type semantics, especially for bool. */
#include <altivec.h>
#endif

/*
 * squared error between a (16*h) block and a bidirectional
 * prediction
 *
 * p2: address of top left pel of block
 * pf,hxf,hyf: address and half pel flags of forward ref. block
 * pb,hxb,hyb: address and half pel flags of backward ref. block
 * h: height of block
 * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
 *
 *
 * Input hints:
 *     a) values for h[xy][fb] are 0 and 1
 *     b) h is 8 or 16
 *
 * {
 *     d = ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
 *           ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i];
 *     sum += d * d;
 * }
 */
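
/*
 * For reference, a plain-C sketch of the computation above.  It assumes
 * the usual mpeg2enc convention that pfa/pfb/pfc are the horizontal,
 * vertical and diagonal half-pel neighbours of pf (and likewise
 * pba/pbb/pbc of pb); the scalar implementation that ALTIVEC_TEST
 * compares against below remains the authoritative one.
 *
 *   int bsumsq_ref(uint8_t *pf, uint8_t *pb, uint8_t *p2, int rowstride,
 *                  int hxf, int hyf, int hxb, int hyb, int h)
 *   {
 *       uint8_t *pfa = pf + hxf, *pfb = pf + hyf * rowstride, *pfc = pfb + hxf;
 *       uint8_t *pba = pb + hxb, *pbb = pb + hyb * rowstride, *pbc = pbb + hxb;
 *       int i, j, d, sum = 0;
 *
 *       for (j = 0; j < h; j++) {
 *           for (i = 0; i < 16; i++) {
 *               d = ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
 *                     ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2) + 1) >> 1) - p2[i];
 *               sum += d * d;
 *           }
 *           pf += rowstride; pfa += rowstride; pfb += rowstride; pfc += rowstride;
 *           pb += rowstride; pba += rowstride; pbb += rowstride; pbc += rowstride;
 *           p2 += rowstride;
 *       }
 *       return sum;
 *   }
 */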

#define BSUMSQ_PDECL \
    uint8_t *pf, \
    uint8_t *pb, \
    uint8_t *p2, \
    int rowstride, \
    int hxf, \
    int hyf, \
    int hxb, \
    int hyb, \
    int h \

#define BSUMSQ_ARGS pf, pb, p2, rowstride, hxf, hyf, hxb, hyb, h

int bsumsq_altivec(BSUMSQ_PDECL)
{
    int i;
    uint8_t *pfy, *pby;
    vector unsigned char l0, l1, lR;
    vector unsigned char permF0, permF1, permB0, permB1;
    vector unsigned char vf, vfa, vfb, vfc;
    vector unsigned char vb, vba, vbb, vbc;
    vector unsigned short tH, tL, fH, fL, bH, bL;
    vector unsigned char zero;
    vector unsigned short one, two;
    vector unsigned char max, min, dif;
    vector unsigned int sum;
    union {
        vector signed int v;
        struct {
            signed int pad[3];
            signed int sum;
        } s;
    } vo;


#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(p2))
        mjpeg_error_exit1("bsumsq: p2 %% 16 != 0, (0x%X)", p2);

    if (NOT_VECTOR_ALIGNED(rowstride))
        mjpeg_error_exit1("bsumsq: rowstride %% 16 != 0, (%d)", rowstride);

    if (hxf != 0 && hxf != 1)
        mjpeg_error_exit1("bsumsq: hxf != [0|1], (hxf=%d)", hxf);
    if (hyf != 0 && hyf != 1)
        mjpeg_error_exit1("bsumsq: hyf != [0|1], (hyf=%d)", hyf);
    if (hxb != 0 && hxb != 1)
        mjpeg_error_exit1("bsumsq: hxb != [0|1], (hxb=%d)", hxb);
    if (hyb != 0 && hyb != 1)
        mjpeg_error_exit1("bsumsq: hyb != [0|1], (hyb=%d)", hyb);
#endif

    if (h != 8 && h != 16)
        mjpeg_error_exit1("bsumsq: h != [8|16], (%d)", h);

    AMBER_START;

    /* start loading first set */
    vfb = vec_ld(0, pf);        /* use vfb & vfc as temp for vf & vfa */
    vfc = vec_ld(16, pf);

    pfy = pf + (rowstride * hyf);
    l0 = vec_ld(0, pfy);
    l1 = vec_ld(16, pfy);


    pby = pb + (rowstride * hyb);


    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);

    sum = vec_splat_u32(0);


    permF0 = vec_lvsl(0, pf);
    permF1 = vec_lvsl(hxf, (unsigned char*)0);
    permF1 = vec_splat(permF1, 0);
    permF1 = vec_add(permF0, permF1);

    permB0 = vec_lvsl(0, pb);
    permB1 = vec_lvsl(hxb, (unsigned char*)0);
    permB1 = vec_splat(permB1, 0);
    permB1 = vec_add(permB0, permB1);
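
    /* Note on the permute setup above: vec_lvsl(0, pf) yields the
     * left-shift permute vector for the (possibly unaligned) address pf,
     * and adding a vector splatted with hxf (0 or 1) moves the selection
     * one byte further.  permF0/permF1 therefore extract pf[0..15] and
     * pf[hxf..hxf+15] from the same pair of loaded quadwords.  Because
     * rowstride is a multiple of 16, the alignment of pf does not change
     * from row to row, so these permute vectors stay valid for the whole
     * loop.  The same holds for permB0/permB1 with pb and hxb.
     */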


    i = h - 1;
    do { /* while (--i) */

        vf = vec_perm(vfb, vfc, permF0);
        vfa = vec_perm(vfb, vfc, permF1);
        vfb = vec_perm(l0, l1, permF0);
        vfc = vec_perm(l0, l1, permF1);

        vbb = vec_ld(0, pb);    /* use vbb & vbc as temp for vb & vba */
        vbc = vec_ld(16, pb);
        l0 = vec_ld(0, pby);
        l1 = vec_ld(16, pby);

        pb += rowstride;
        pby += rowstride;

        /* (unsigned short[]) pf[0-7] */
        fH = vu16(vec_mergeh(zero, vf));

        /* (unsigned short[]) pf[8-15] */
        fL = vu16(vec_mergel(zero, vf));

        /* (unsigned short[]) pfa[0-7] */
        tH = vu16(vec_mergeh(zero, vfa));

        /* (unsigned short[]) pfa[8-15] */
        tL = vu16(vec_mergel(zero, vfa));

        /* pf[i] + pfa[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);

        /* (unsigned short[]) pfb[0-7] */
        tH = vu16(vec_mergeh(zero, vfb));

        /* (unsigned short[]) pfb[8-15] */
        tL = vu16(vec_mergel(zero, vfb));

        /* (pf[i]+pfa[i]) + pfb[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);

        /* (unsigned short[]) pfc[0-7] */
        tH = vu16(vec_mergeh(zero, vfc));

        /* (unsigned short[]) pfc[8-15] */
        tL = vu16(vec_mergel(zero, vfc));

        /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
        fH = vec_add(fH, tH);
        fL = vec_add(fL, tL);


        /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
        fH = vec_add(fH, two);
        fL = vec_add(fL, two);

        /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
        fH = vec_sra(fH, two);
        fL = vec_sra(fL, two);


        lR = vec_ld(0, p2);
        p2 += rowstride;

        vb = vec_perm(vbb, vbc, permB0);
        vba = vec_perm(vbb, vbc, permB1);
        vbb = vec_perm(l0, l1, permB0);
        vbc = vec_perm(l0, l1, permB1);


        pf += rowstride;
        vfb = vec_ld(0, pf);    /* use vfb & vfc as temp for vf & vfa */
        vfc = vec_ld(16, pf);
        pfy += rowstride;
        l0 = vec_ld(0, pfy);
        l1 = vec_ld(16, pfy);

        /* (unsigned short[]) pb[0-7] */
        bH = vu16(vec_mergeh(zero, vb));

        /* (unsigned short[]) pb[8-15] */
        bL = vu16(vec_mergel(zero, vb));

        /* (unsigned short[]) pba[0-7] */
        tH = vu16(vec_mergeh(zero, vba));

        /* (unsigned short[]) pba[8-15] */
        tL = vu16(vec_mergel(zero, vba));

        /* pb[i] + pba[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);

        /* (unsigned short[]) pbb[0-7] */
        tH = vu16(vec_mergeh(zero, vbb));

        /* (unsigned short[]) pbb[8-15] */
        tL = vu16(vec_mergel(zero, vbb));

        /* (pb[i]+pba[i]) + pbb[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);

        /* (unsigned short[]) pbc[0-7] */
        tH = vu16(vec_mergeh(zero, vbc));

        /* (unsigned short[]) pbc[8-15] */
        tL = vu16(vec_mergel(zero, vbc));

        /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
        bH = vec_add(bH, tH);
        bL = vec_add(bL, tL);


        /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
        bH = vec_add(bH, two);
        bL = vec_add(bL, two);

        /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
        bH = vec_sra(bH, two);
        bL = vec_sra(bL, two);

        /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
         * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
         */
        tH = vec_add(fH, bH);
        tL = vec_add(fL, bL);

        /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
         */
        tH = vec_add(tH, one);
        tL = vec_add(tL, one);

        /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
         */
        tH = vec_sra(tH, one);
        tL = vec_sra(tL, one);

        /* absolute value increases parallelism (x16 instead of x8)
         * since a bit isn't lost on the sign.
         *
         * d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
         *           ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
         */
        tH = vu16(vec_packsu(tH, tL));
        min = vec_min(vu8(tH), lR);
        max = vec_max(vu8(tH), lR);
        dif = vec_sub(max, min);
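
        /* Note: with unsigned pels, max - min above is |a - b|, so dif
         * holds the absolute differences; vec_msum() below multiplies
         * each byte of dif by itself and adds groups of four products
         * into the 32-bit accumulators of sum.
         */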

        /* sum += (d * d) */
        sum = vec_msum(dif, dif, sum);

    } while (--i);

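    /* Last row is handled outside the loop: each loop iteration ends by
     * loading the forward-reference data for the following row, so peeling
     * the final iteration avoids reading one row past the referenced block.
     */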
    vf = vec_perm(vfb, vfc, permF0);
    vfa = vec_perm(vfb, vfc, permF1);
    vfb = vec_perm(l0, l1, permF0);
    vfc = vec_perm(l0, l1, permF1);

    vbb = vec_ld(0, pb);        /* use vbb & vbc as temp for vb & vba */
    vbc = vec_ld(16, pb);
    l0 = vec_ld(0, pby);
    l1 = vec_ld(16, pby);

    /* (unsigned short[]) pf[0-7] */
    fH = vu16(vec_mergeh(zero, vf));

    /* (unsigned short[]) pf[8-15] */
    fL = vu16(vec_mergel(zero, vf));

    /* (unsigned short[]) pfa[0-7] */
    tH = vu16(vec_mergeh(zero, vfa));

    /* (unsigned short[]) pfa[8-15] */
    tL = vu16(vec_mergel(zero, vfa));

    /* pf[i] + pfa[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfb[0-7] */
    tH = vu16(vec_mergeh(zero, vfb));

    /* (unsigned short[]) pfb[8-15] */
    tL = vu16(vec_mergel(zero, vfb));

    /* (pf[i]+pfa[i]) + pfb[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfc[0-7] */
    tH = vu16(vec_mergeh(zero, vfc));

    /* (unsigned short[]) pfc[8-15] */
    tL = vu16(vec_mergel(zero, vfc));

    /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
    fH = vec_add(fH, two);
    fL = vec_add(fL, two);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
    fH = vec_sra(fH, two);
    fL = vec_sra(fL, two);

    lR = vec_ld(0, p2);

    vb = vec_perm(vbb, vbc, permB0);
    vba = vec_perm(vbb, vbc, permB1);
    vbb = vec_perm(l0, l1, permB0);
    vbc = vec_perm(l0, l1, permB1);

    /* (unsigned short[]) pb[0-7] */
    bH = vu16(vec_mergeh(zero, vb));

    /* (unsigned short[]) pb[8-15] */
    bL = vu16(vec_mergel(zero, vb));

    /* (unsigned short[]) pba[0-7] */
    tH = vu16(vec_mergeh(zero, vba));

    /* (unsigned short[]) pba[8-15] */
    tL = vu16(vec_mergel(zero, vba));

    /* pb[i] + pba[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbb[0-7] */
    tH = vu16(vec_mergeh(zero, vbb));

    /* (unsigned short[]) pbb[8-15] */
    tL = vu16(vec_mergel(zero, vbb));

    /* (pb[i]+pba[i]) + pbb[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbc[0-7] */
    tH = vu16(vec_mergeh(zero, vbc));

    /* (unsigned short[]) pbc[8-15] */
    tL = vu16(vec_mergel(zero, vbc));

    /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);


    /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
    bH = vec_add(bH, two);
    bL = vec_add(bL, two);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
    bH = vec_sra(bH, two);
    bL = vec_sra(bL, two);

    /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
     */
    tH = vec_add(fH, bH);
    tL = vec_add(fL, bL);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
     */
    tH = vec_add(tH, one);
    tL = vec_add(tL, one);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
     */
    tH = vec_sra(tH, one);
    tL = vec_sra(tL, one);

    /* absolute value increases parallelism (x16 instead of x8)
     * since a bit isn't lost on the sign.
     *
     * d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *           ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
     */
    tH = vu16(vec_packsu(tH, tL));
    min = vec_min(vu8(tH), lR);
    max = vec_max(vu8(tH), lR);
    dif = vec_sub(max, min);

    /* sum += (d * d) */
    sum = vec_msum(dif, dif, sum);

    /* sum all parts of difference into one 32 bit quantity */
    vo.v = vec_sums(vs32(sum), vs32(zero));
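
    /* vec_sums() adds the four 32-bit partial sums (plus zero) and leaves
     * the total in the last element of the result vector, which the vo
     * union reads back as the scalar vo.s.sum.
     */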

    AMBER_STOP;
    return vo.s.sum;
}

#if ALTIVEC_TEST_FUNCTION(bsumsq)

#undef BENCHMARK_FREQUENCY
#define BENCHMARK_FREQUENCY  1000   /* benchmark every (n) calls */

ALTIVEC_TEST(bsumsq, int, (BSUMSQ_PDECL),
    "pf=0x%X, pb=0x%X, p2=0x%X, rowstride=%d, "
    "hxf=%d, hyf=%d, hxb=%d, hyb=%d, h=%d",
    BSUMSQ_ARGS);
#endif
/* vim:set foldmethod=marker foldlevel=0: */