/* bsumsq.c, this file is part of the
 * AltiVec optimized library for MJPEG tools MPEG-1/2 Video Encoder
 * Copyright (C) 2002  James Klicman <james@klicman.org>
 *
 * This library is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "altivec_motion.h"
#include "vectorize.h"
#include "../mjpeg_logging.h"

/* #define AMBER_ENABLE */
#include "amber.h"

#ifdef HAVE_ALTIVEC_H
/* include last to ensure AltiVec type semantics, especially for bool. */
#include <altivec.h>
#endif

/*
 * squared error between a (16*h) block and a bidirectional
 * prediction
 *
 * p2: address of top left pel of block
 * pf,hxf,hyf: address and half pel flags of forward ref. block
 * pb,hxb,hyb: address and half pel flags of backward ref. block
 * h: height of block
 * rowstride: distance (in bytes) of vertically adjacent pels in p2,pf,pb
 *
 * Input hints:
 *  a) values for h[xy][fb] are 0 and 1
 *  b) h is 8 or 16
 *
 * {
 *     d = ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
 *           ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i];
 *     sum += d * d;
 * }
 */
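
/*
 * Scalar reference sketch of the computation above (illustration only,
 * not compiled here).  Assuming the usual mpeg2enc layout, pfa/pfb/pfc
 * are the horizontal, vertical and diagonal half-pel neighbours of pf
 * selected by hxf/hyf, and pba/pbb/pbc likewise for pb with hxb/hyb:
 *
 *   int bsumsq_scalar(BSUMSQ_PDECL)
 *   {
 *       uint8_t *pfa = pf + hxf, *pfb = pf + rowstride * hyf, *pfc = pfb + hxf;
 *       uint8_t *pba = pb + hxb, *pbb = pb + rowstride * hyb, *pbc = pbb + hxb;
 *       int i, j, d, sum = 0;
 *
 *       for (j = 0; j < h; j++) {
 *           for (i = 0; i < 16; i++) {
 *               d = ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
 *                     ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2) + 1) >> 1) - p2[i];
 *               sum += d * d;
 *           }
 *           pf += rowstride; pfa += rowstride; pfb += rowstride; pfc += rowstride;
 *           pb += rowstride; pba += rowstride; pbb += rowstride; pbc += rowstride;
 *           p2 += rowstride;
 *       }
 *       return sum;
 *   }
 */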

#define BSUMSQ_PDECL                                                         \
  uint8_t *pf,                                                               \
  uint8_t *pb,                                                               \
  uint8_t *p2,                                                               \
  int rowstride,                                                             \
  int hxf,                                                                   \
  int hyf,                                                                   \
  int hxb,                                                                   \
  int hyb,                                                                   \
  int h                                                                      \

#define BSUMSQ_ARGS pf, pb, p2, rowstride, hxf, hyf, hxb, hyb, h

int bsumsq_altivec(BSUMSQ_PDECL)
{
    int i;
    uint8_t *pfy, *pby;
    vector unsigned char l0, l1, lR;
    vector unsigned char permF0, permF1, permB0, permB1;
    vector unsigned char vf, vfa, vfb, vfc;
    vector unsigned char vb, vba, vbb, vbc;
    vector unsigned short tH, tL, fH, fL, bH, bL;
    vector unsigned char zero;
    vector unsigned short one, two;
    vector unsigned char max, min, dif;
    vector unsigned int sum;
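    /* vo overlays a scalar struct on a vector register image so the
     * 32-bit total that vec_sums() leaves in the last vector element
     * can be read back as vo.s.sum.
     */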
    union {
	vector signed int v;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vo;

#ifdef ALTIVEC_VERIFY
    if (NOT_VECTOR_ALIGNED(p2))
	mjpeg_error_exit1("bsumsq: p2 %% 16 != 0, (0x%X)", p2);

    if (NOT_VECTOR_ALIGNED(rowstride))
	mjpeg_error_exit1("bsumsq: rowstride %% 16 != 0, (%d)", rowstride);

    if (hxf != 0 && hxf != 1)
	mjpeg_error_exit1("bsumsq: hxf != [0|1], (hxf=%d)", hxf);
    if (hyf != 0 && hyf != 1)
	mjpeg_error_exit1("bsumsq: hyf != [0|1], (hyf=%d)", hyf);
    if (hxb != 0 && hxb != 1)
	mjpeg_error_exit1("bsumsq: hxb != [0|1], (hxb=%d)", hxb);
    if (hyb != 0 && hyb != 1)
	mjpeg_error_exit1("bsumsq: hyb != [0|1], (hyb=%d)", hyb);
#endif

    if (h != 8 && h != 16)
	mjpeg_error_exit1("bsumsq: h != [8|16], (%d)", h);

    AMBER_START;

    /* start loading first set */
    vfb = vec_ld(0, pf);	 /* use vfb & vfc as temp for vf & vfa */
    vfc = vec_ld(16, pf);

    pfy = pf + (rowstride * hyf);
    l0 = vec_ld(0, pfy);
    l1 = vec_ld(16, pfy);

    pby = pb + (rowstride * hyb);

    zero = vec_splat_u8(0);
    one = vec_splat_u16(1);
    two = vec_splat_u16(2);

    sum = vec_splat_u32(0);

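    /* vec_lvsl(0, p) yields the permute vector that selects the 16-byte
     * row starting at the unaligned address p from two aligned loads.
     * Splatting element 0 of vec_lvsl(hx, 0) (which is just hx) and
     * adding it shifts every selector one byte further when the
     * horizontal half-pel flag is set, so v* and v*a can be extracted
     * from the same pair of loads.
     */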
    permF0 = vec_lvsl(0, pf);
    permF1 = vec_lvsl(hxf, (unsigned char*)0);
    permF1 = vec_splat(permF1, 0);
    permF1 = vec_add(permF0, permF1);

    permB0 = vec_lvsl(0, pb);
    permB1 = vec_lvsl(hxb, (unsigned char*)0);
    permB1 = vec_splat(permB1, 0);
    permB1 = vec_add(permB0, permB1);

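    /* The loop is software pipelined: the aligned loads for the next row
     * are issued inside the loop body, and the arithmetic for the final
     * row is peeled off below the loop, so only h - 1 iterations run here.
     */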
    i = h - 1;
    do { /* while (--i) */

	vf = vec_perm(vfb, vfc, permF0);
	vfa = vec_perm(vfb, vfc, permF1);
	vfb = vec_perm(l0, l1, permF0);
	vfc = vec_perm(l0, l1, permF1);

	vbb = vec_ld(0, pb);	 /* use vbb & vbc as temp for vb & vba */
	vbc = vec_ld(16, pb);
	l0 = vec_ld(0, pby);
	l1 = vec_ld(16, pby);

	pb += rowstride;
	pby += rowstride;

	/* (unsigned short[]) pf[0-7] */
	fH = vu16(vec_mergeh(zero, vf));

	/* (unsigned short[]) pf[8-15] */
	fL = vu16(vec_mergel(zero, vf));

	/* (unsigned short[]) pfa[0-7] */
	tH = vu16(vec_mergeh(zero, vfa));

	/* (unsigned short[]) pfa[8-15] */
	tL = vu16(vec_mergel(zero, vfa));

	/* pf[i] + pfa[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);

	/* (unsigned short[]) pfb[0-7] */
	tH = vu16(vec_mergeh(zero, vfb));

	/* (unsigned short[]) pfb[8-15] */
	tL = vu16(vec_mergel(zero, vfb));

	/* (pf[i]+pfa[i]) + pfb[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);

	/* (unsigned short[]) pfc[0-7] */
	tH = vu16(vec_mergeh(zero, vfc));

	/* (unsigned short[]) pfc[8-15] */
	tL = vu16(vec_mergel(zero, vfc));

	/* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
	fH = vec_add(fH, tH);
	fL = vec_add(fL, tL);

	/* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
	fH = vec_add(fH, two);
	fL = vec_add(fL, two);

	/* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
	fH = vec_sra(fH, two);
	fL = vec_sra(fL, two);
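	/* Note: vec_sra (algebraic shift) is safe on these unsigned sums;
	 * they are at most 4*255 + 2 = 1022, so the sign bit of each
	 * 16-bit element is never set.
	 */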

	lR = vec_ld(0, p2);
	p2 += rowstride;

	vb = vec_perm(vbb, vbc, permB0);
	vba = vec_perm(vbb, vbc, permB1);
	vbb = vec_perm(l0, l1, permB0);
	vbc = vec_perm(l0, l1, permB1);

	pf += rowstride;
	vfb = vec_ld(0, pf);	 /* use vfb & vfc as temp for vf & vfa */
	vfc = vec_ld(16, pf);
	pfy += rowstride;
	l0 = vec_ld(0, pfy);
	l1 = vec_ld(16, pfy);

	/* (unsigned short[]) pb[0-7] */
	bH = vu16(vec_mergeh(zero, vb));

	/* (unsigned short[]) pb[8-15] */
	bL = vu16(vec_mergel(zero, vb));

	/* (unsigned short[]) pba[0-7] */
	tH = vu16(vec_mergeh(zero, vba));

	/* (unsigned short[]) pba[8-15] */
	tL = vu16(vec_mergel(zero, vba));

	/* pb[i] + pba[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);

	/* (unsigned short[]) pbb[0-7] */
	tH = vu16(vec_mergeh(zero, vbb));

	/* (unsigned short[]) pbb[8-15] */
	tL = vu16(vec_mergel(zero, vbb));

	/* (pb[i]+pba[i]) + pbb[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);

	/* (unsigned short[]) pbc[0-7] */
	tH = vu16(vec_mergeh(zero, vbc));

	/* (unsigned short[]) pbc[8-15] */
	tL = vu16(vec_mergel(zero, vbc));

	/* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
	bH = vec_add(bH, tH);
	bL = vec_add(bL, tL);

	/* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
	bH = vec_add(bH, two);
	bL = vec_add(bL, two);

	/* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
	bH = vec_sra(bH, two);
	bL = vec_sra(bL, two);

	/* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
	 * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
	 */
	tH = vec_add(fH, bH);
	tL = vec_add(fL, bL);

	/* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
	 */
	tH = vec_add(tH, one);
	tL = vec_add(tL, one);

	/* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
	 */
	tH = vec_sra(tH, one);
	tL = vec_sra(tL, one);
	/* Computing |d| as unsigned bytes (via min/max) lets vec_msum
	 * process all 16 pels at once instead of 8 halfwords, since no
	 * bit is lost to the sign.
	 *
	 * d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
	 *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
	 */
	tH = vu16(vec_packsu(tH, tL));
	min = vec_min(vu8(tH), lR);
	max = vec_max(vu8(tH), lR);
	dif = vec_sub(max, min);

	/* sum += d * d; vec_msum multiplies the 16 byte differences
	 * element-wise and accumulates each group of four products into
	 * the corresponding 32-bit element of sum.
	 */
	sum = vec_msum(dif, dif, sum);

    } while (--i);
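
    /* final row: this is the loop iteration peeled off above */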
    vf = vec_perm(vfb, vfc, permF0);
    vfa = vec_perm(vfb, vfc, permF1);
    vfb = vec_perm(l0, l1, permF0);
    vfc = vec_perm(l0, l1, permF1);

    vbb = vec_ld(0, pb);	 /* use vbb & vbc as temp for vb & vba */
    vbc = vec_ld(16, pb);
    l0 = vec_ld(0, pby);
    l1 = vec_ld(16, pby);

    /* (unsigned short[]) pf[0-7] */
    fH = vu16(vec_mergeh(zero, vf));

    /* (unsigned short[]) pf[8-15] */
    fL = vu16(vec_mergel(zero, vf));

    /* (unsigned short[]) pfa[0-7] */
    tH = vu16(vec_mergeh(zero, vfa));

    /* (unsigned short[]) pfa[8-15] */
    tL = vu16(vec_mergel(zero, vfa));

    /* pf[i] + pfa[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfb[0-7] */
    tH = vu16(vec_mergeh(zero, vfb));

    /* (unsigned short[]) pfb[8-15] */
    tL = vu16(vec_mergel(zero, vfb));

    /* (pf[i]+pfa[i]) + pfb[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (unsigned short[]) pfc[0-7] */
    tH = vu16(vec_mergeh(zero, vfc));

    /* (unsigned short[]) pfc[8-15] */
    tL = vu16(vec_mergel(zero, vfc));

    /* (pf[i]+pfa[i]+pfb[i]) + pfc[i] */
    fH = vec_add(fH, tH);
    fL = vec_add(fL, tL);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]) + 2 */
    fH = vec_add(fH, two);
    fL = vec_add(fL, two);

    /* (pf[i]+pfa[i]+pfb[i]+pfc[i]+2) >> 2 */
    fH = vec_sra(fH, two);
    fL = vec_sra(fL, two);

    lR = vec_ld(0, p2);

    vb = vec_perm(vbb, vbc, permB0);
    vba = vec_perm(vbb, vbc, permB1);
    vbb = vec_perm(l0, l1, permB0);
    vbc = vec_perm(l0, l1, permB1);

    /* (unsigned short[]) pb[0-7] */
    bH = vu16(vec_mergeh(zero, vb));

    /* (unsigned short[]) pb[8-15] */
    bL = vu16(vec_mergel(zero, vb));

    /* (unsigned short[]) pba[0-7] */
    tH = vu16(vec_mergeh(zero, vba));

    /* (unsigned short[]) pba[8-15] */
    tL = vu16(vec_mergel(zero, vba));

    /* pb[i] + pba[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbb[0-7] */
    tH = vu16(vec_mergeh(zero, vbb));

    /* (unsigned short[]) pbb[8-15] */
    tL = vu16(vec_mergel(zero, vbb));

    /* (pb[i]+pba[i]) + pbb[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (unsigned short[]) pbc[0-7] */
    tH = vu16(vec_mergeh(zero, vbc));

    /* (unsigned short[]) pbc[8-15] */
    tL = vu16(vec_mergel(zero, vbc));

    /* (pb[i]+pba[i]+pbb[i]) + pbc[i] */
    bH = vec_add(bH, tH);
    bL = vec_add(bL, tL);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]) + 2 */
    bH = vec_add(bH, two);
    bL = vec_add(bL, two);

    /* (pb[i]+pba[i]+pbb[i]+pbc[i]+2) >> 2 */
    bH = vec_sra(bH, two);
    bL = vec_sra(bL, two);

    /* ((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2) +
     * ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)
     */
    tH = vec_add(fH, bH);
    tL = vec_add(fL, bL);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)) + 1
     */
    tH = vec_add(tH, one);
    tL = vec_add(tL, one);

    /* (((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *  ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1) >> 1
     */
    tH = vec_sra(tH, one);
    tL = vec_sra(tL, one);
    /* Computing |d| as unsigned bytes (via min/max) lets vec_msum
     * process all 16 pels at once instead of 8 halfwords, since no
     * bit is lost to the sign.
     *
     * d = abs( ((((pf[i]+pfa[i]+pfb[i]+pfc[i]+2)>>2)+
     *            ((pb[i]+pba[i]+pbb[i]+pbc[i]+2)>>2)+1)>>1) - p2[i] )
     */
    tH = vu16(vec_packsu(tH, tL));
    min = vec_min(vu8(tH), lR);
    max = vec_max(vu8(tH), lR);
    dif = vec_sub(max, min);

    /* sum += (d * d) */
    sum = vec_msum(dif, dif, sum);

    /* add the four 32-bit partial sums into a single total; vec_sums
     * leaves the result in the last vector element, which the vo union
     * exposes as vo.s.sum.
     */
    vo.v = vec_sums(vs32(sum), vs32(zero));

    AMBER_STOP;
    return vo.s.sum;
}

#if ALTIVEC_TEST_FUNCTION(bsumsq)

#undef BENCHMARK_FREQUENCY
#define BENCHMARK_FREQUENCY  1000   /* benchmark every (n) calls */

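/* ALTIVEC_TEST (see vectorize.h) wraps bsumsq_altivec in the test
 * harness, which is expected to verify its result against the scalar
 * implementation and benchmark it every BENCHMARK_FREQUENCY calls.
 */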
ALTIVEC_TEST(bsumsq, int, (BSUMSQ_PDECL),
  "pf=0x%X, pb=0x%X, p2=0x%X, rowstride=%d, "
  "hxf=%d, hyf=%d, hxb=%d, hyb=%d, h=%d",
  BSUMSQ_ARGS);
#endif
/* vim:set foldmethod=marker foldlevel=0: */