1 /*
2 * motion_comp_vis.c
3 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
4 *
5 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
6 * See http://libmpeg2.sourceforge.net/ for updates.
7 *
8 * mpeg2dec is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * mpeg2dec is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21 */
22
23 #include "config.h"
24
25 #if defined(ARCH_SPARC) && defined(ENABLE_VIS)
26
27 #include <inttypes.h>
28
29 #include "../include/mpeg2.h"
30 #include "../include/attributes.h"
31 #include "mpeg2_internal.h"
32 #include "../include/vis.h"
33
34 /* The trick used in some of this file is the formula from the MMX
35 * motion comp code, which is:
36 *
37 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
38 *
39 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
40 * We avoid overflows by masking before we do the shift, and we
41 * implement the shift by multiplying by 1/2 using mul8x16. So in
42 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
43 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
44 * the value 0x80808080 is in f8):
45 *
46 * fxor f0, f2, f10
47 * fand f10, f4, f10
48 * fmul8x16 f8, f10, f10
49 * fand f10, f6, f10
50 * for f0, f2, f12
51 * fpsub16 f12, f10, f10
52 */
53
/* 8-byte-aligned constant tables; each is loaded whole into a 64-bit VIS
 * FP register (see the CONST_* / MASK_* register defines below). */
#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
/* rounding biases, replicated across the four 16-bit lanes */
static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
/* byte masks for the (x|y)-((x^y)>>1) averaging trick (see header comment) */
static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
/* multiplying a byte by 128 with fmul8x16 implements a >>1 per byte */
static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
/* interleaved scale factors: 256 for mul8x16au (ref bytes), 512/1024 for
 * mul8x16al (dest bytes), chosen per function to match its GSR pack shift */
static const int16_t constants256_512[] ATTR_ALIGN(8) =
	{256, 512, 256, 512};
static const int16_t constants256_1024[] ATTR_ALIGN(8) =
	{256, 1024, 256, 1024};
67
/* Symbolic names for the VIS FP register numbers passed to the vis_*
 * macros.  Values 0-30 name 32-bit registers (even/odd pairs form a
 * 64-bit datum, e.g. REF_0/REF_0_1); values >= 32 are presumably the
 * upper register file, usable only as 64-bit doubles -- TODO confirm
 * against vis.h.  Note the deliberate aliasing: the CONST_*/MASK_*
 * groups share registers 20, 22 and 30 because no single function
 * loads more than one constant from each group. */
#define REF_0 0
#define REF_0_1 1
#define REF_2 2
#define REF_2_1 3
#define REF_4 4
#define REF_4_1 5
#define REF_6 6
#define REF_6_1 7
#define REF_S0 8
#define REF_S0_1 9
#define REF_S2 10
#define REF_S2_1 11
#define REF_S4 12
#define REF_S4_1 13
#define REF_S6 14
#define REF_S6_1 15
#define DST_0 16
#define DST_1 17
#define DST_2 18
#define DST_3 19
/* aliases: all share register 20 */
#define CONST_1 20
#define CONST_2 20
#define CONST_3 20
#define CONST_6 20
#define MASK_fe 20
/* aliases: all share register 22 */
#define CONST_128 22
#define CONST_256 22
#define CONST_512 22
#define CONST_1024 22
#define TMP0 24
#define TMP1 25
#define TMP2 26
#define TMP3 27
#define TMP4 28
#define TMP5 29
/* aliases: register 30 */
#define ZERO 30
#define MASK_7f 30

#define TMP6 32
#define TMP8 34
#define TMP10 36
#define TMP12 38
#define TMP14 40
#define TMP16 42
#define TMP18 44
#define TMP20 46
#define TMP22 48
#define TMP24 50
#define TMP26 52
#define TMP28 54
#define TMP30 56
#define TMP32 58
120
/* Copy a 16-pixel-wide, `height`-row block from `ref` to `dest`.
 * `_ref` may be arbitrarily byte-aligned; vis_alignaddr + faligndata
 * realign it on the fly.  `dest` must be 8-byte aligned (vis_st64).
 * The instruction order is hand-scheduled -- do not reorder. */
static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int offset;

    ref = vis_alignaddr(ref);
    /* a third source word is needed only if ref was truly misaligned */
    offset = (ref != _ref) ? 16 : 0;
    do {	/* 5 cycles */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, 8, TMP2);

	vis_ld64_2(ref, offset, TMP4);
	ref += stride;

	vis_faligndata(TMP0, TMP2, REF_0);
	vis_st64(REF_0, dest[0]);

	vis_faligndata(TMP2, TMP4, REF_2);
	vis_st64_2(REF_2, dest, 8);
	dest += stride;
    } while (--height);
}
145
/* Copy an 8-pixel-wide, `height`-row block from `ref` to `dest`.
 * Same realignment scheme as MC_put_o_16_vis, one 64-bit store per row. */
static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int offset;

    ref = vis_alignaddr(ref);
    /* second source word only needed when ref was truly misaligned */
    offset = (ref != _ref) ? 8 : 0;
    do {	/* 4 cycles */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, offset, TMP2);
	ref += stride;

	/* stall */

	vis_faligndata(TMP0, TMP2, REF_0);
	vis_st64(REF_0, dest[0]);
	dest += stride;
    } while (--height);
}
167
168
/* dest = rounded average of dest and a 16-wide block of ref, per byte:
 * (d + r + 1) >> 1, computed as (d|r) - (((d^r) & 0xfe) >> 1) with the
 * shift done via fmul8x16 by 128 (see file-header comment).
 * The loop is software-pipelined two rows deep: the prologue primes
 * REF_0/REF_2 and DST_0/DST_2, the loop body handles two rows per
 * iteration, and the tail after the loop finishes the last row pair.
 * height is assumed even and >= 4 -- TODO confirm with callers. */
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;
    int offset;

    ref = vis_alignaddr(ref);
    offset = (ref != _ref) ? 16 : 0;

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64_2(ref, offset, TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;	/* two rows/iteration; last pair in tail */

    do {	/* 24 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(DST_0, REF_0, TMP6);

	vis_ld64_2(ref, 8, TMP2);
	vis_and(TMP6, MASK_fe, TMP6);

	vis_ld64_2(ref, offset, TMP4);
	ref += stride;
	vis_mul8x16(CONST_128, TMP6, TMP6);	/* (d^r)>>1 per byte */
	vis_xor(DST_2, REF_2, TMP8);

	vis_and(TMP8, MASK_fe, TMP8);

	vis_or(DST_0, REF_0, TMP10);
	vis_ld64_2(dest, stride, DST_0);
	vis_mul8x16(CONST_128, TMP8, TMP8);

	vis_or(DST_2, REF_2, TMP12);
	vis_ld64_2(dest, stride_8, DST_2);

	vis_ld64(ref[0], TMP14);
	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);	/* (d|r) - ((d^r)>>1) */
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);

	dest += stride;
	vis_ld64_2(ref, 8, TMP16);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, offset, TMP18);
	vis_faligndata(TMP2, TMP4, REF_2);
	ref += stride;

	vis_xor(DST_0, REF_0, TMP20);

	vis_and(TMP20, MASK_fe, TMP20);

	vis_xor(DST_2, REF_2, TMP22);
	vis_mul8x16(CONST_128, TMP20, TMP20);

	vis_and(TMP22, MASK_fe, TMP22);

	vis_or(DST_0, REF_0, TMP24);
	vis_mul8x16(CONST_128, TMP22, TMP22);

	vis_or(DST_2, REF_2, TMP26);

	vis_ld64_2(dest, stride, DST_0);
	vis_faligndata(TMP14, TMP16, REF_0);

	vis_ld64_2(dest, stride_8, DST_2);
	vis_faligndata(TMP16, TMP18, REF_2);

	vis_and(TMP20, MASK_7f, TMP20);

	vis_and(TMP22, MASK_7f, TMP22);

	vis_psub16(TMP24, TMP20, TMP20);
	vis_st64(TMP20, dest[0]);

	vis_psub16(TMP26, TMP22, TMP22);
	vis_st64_2(TMP22, dest, 8);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows, no further dest prefetch */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, offset, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_or(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_or(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_or(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_or(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_psub16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_psub16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
331
/* 8-wide version of MC_avg_o_16_vis: dest = (dest + ref + 1) >> 1 per
 * byte via the (d|r) - (((d^r)&0xfe)>>1) identity.  Software-pipelined,
 * two rows per loop iteration, last pair finished in the tail.
 * height assumed even and >= 4 -- TODO confirm with callers. */
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int offset;

    ref = vis_alignaddr(ref);
    offset = (ref != _ref) ? 8 : 0;

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, offset, TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {	/* 12 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(DST_0, REF_0, TMP4);

	vis_ld64_2(ref, offset, TMP2);
	vis_and(TMP4, MASK_fe, TMP4);

	vis_or(DST_0, REF_0, TMP6);
	vis_ld64_2(dest, stride, DST_0);
	ref += stride;
	vis_mul8x16(CONST_128, TMP4, TMP4);	/* (d^r)>>1 per byte */

	vis_ld64(ref[0], TMP12);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, offset, TMP2);
	vis_xor(DST_0, REF_0, TMP0);
	ref += stride;

	vis_and(TMP0, MASK_fe, TMP0);

	vis_and(TMP4, MASK_7f, TMP4);

	vis_psub16(TMP6, TMP4, TMP4);	/* (d|r) - ((d^r)>>1) */
	vis_st64(TMP4, dest[0]);
	dest += stride;
	vis_mul8x16(CONST_128, TMP0, TMP0);

	vis_or(DST_0, REF_0, TMP6);
	vis_ld64_2(dest, stride, DST_0);

	vis_faligndata(TMP12, TMP2, REF_0);

	vis_and(TMP0, MASK_7f, TMP0);

	vis_psub16(TMP6, TMP0, TMP4);
	vis_st64(TMP4, dest[0]);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64_2(ref, offset, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_or(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_psub16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
427
/* Horizontal half-pel put, 16 wide: dest = (ref[i] + ref[i+1] + 1) >> 1.
 * REF_0/REF_4 hold the block aligned at `off`, REF_2/REF_6 the same data
 * shifted one byte (off+1); the two are averaged with the masked
 * xor/or trick.  When off == 7, off+1 == 8 cannot be expressed via
 * alignaddr_g0/faligndata, so vis_src1 supplies the shifted data
 * instead.  Pipelined two rows per iteration with a two-row tail;
 * height assumed even and >= 4 -- TODO confirm with callers. */
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_2);
	vis_faligndata(TMP2, TMP4, REF_6);
    } else {
	vis_src1(TMP2, REF_2);
	vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {	/* 34 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP6);

	vis_ld64_2(ref, 8, TMP2);
	vis_xor(REF_4, REF_6, TMP8);

	vis_ld64_2(ref, 16, TMP4);
	vis_and(TMP6, MASK_fe, TMP6);
	ref += stride;

	vis_ld64(ref[0], TMP14);
	vis_mul8x16(CONST_128, TMP6, TMP6);	/* (a^b)>>1 per byte */
	vis_and(TMP8, MASK_fe, TMP8);

	vis_ld64_2(ref, 8, TMP16);
	vis_mul8x16(CONST_128, TMP8, TMP8);
	vis_or(REF_0, REF_2, TMP10);

	vis_ld64_2(ref, 16, TMP18);
	ref += stride;
	vis_or(REF_4, REF_6, TMP12);

	vis_alignaddr_g0((void *)off);

	vis_faligndata(TMP0, TMP2, REF_0);

	vis_faligndata(TMP2, TMP4, REF_4);

	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_2);
	    vis_faligndata(TMP2, TMP4, REF_6);
	} else {
	    vis_src1(TMP2, REF_2);
	    vis_src1(TMP4, REF_6);
	}

	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);	/* (a|b) - ((a^b)>>1) */
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);
	dest += stride;

	vis_xor(REF_0, REF_2, TMP6);

	vis_xor(REF_4, REF_6, TMP8);

	vis_and(TMP6, MASK_fe, TMP6);

	vis_mul8x16(CONST_128, TMP6, TMP6);
	vis_and(TMP8, MASK_fe, TMP8);

	vis_mul8x16(CONST_128, TMP8, TMP8);
	vis_or(REF_0, REF_2, TMP10);

	vis_or(REF_4, REF_6, TMP12);

	vis_alignaddr_g0((void *)off);

	vis_faligndata(TMP14, TMP16, REF_0);

	vis_faligndata(TMP16, TMP18, REF_4);

	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP14, TMP16, REF_2);
	    vis_faligndata(TMP16, TMP18, REF_6);
	} else {
	    vis_src1(TMP16, REF_2);
	    vis_src1(TMP18, REF_6);
	}

	vis_and(TMP6, MASK_7f, TMP6);

	vis_and(TMP8, MASK_7f, TMP8);

	vis_psub16(TMP10, TMP6, TMP6);
	vis_st64(TMP6, dest[0]);

	vis_psub16(TMP12, TMP8, TMP8);
	vis_st64_2(TMP8, dest, 8);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_2);
	vis_faligndata(TMP2, TMP4, REF_6);
    } else {
	vis_src1(TMP2, REF_2);
	vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
620
/* Horizontal half-pel put, 8 wide: dest = (ref[i] + ref[i+1] + 1) >> 1.
 * Same dual-alignment scheme as MC_put_x_16_vis (REF_0 aligned at off,
 * REF_2 shifted one byte; vis_src1 covers the off == 7 case).
 * Pipelined two rows per iteration with a two-row tail; height assumed
 * even and >= 4 -- TODO confirm with callers. */
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_2);
    } else {
	vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {	/* 20 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP4);

	vis_ld64_2(ref, 8, TMP2);
	vis_and(TMP4, MASK_fe, TMP4);
	ref += stride;

	vis_ld64(ref[0], TMP8);
	vis_or(REF_0, REF_2, TMP6);
	vis_mul8x16(CONST_128, TMP4, TMP4);	/* (a^b)>>1 per byte */

	vis_alignaddr_g0((void *)off);

	vis_ld64_2(ref, 8, TMP10);
	ref += stride;
	vis_faligndata(TMP0, TMP2, REF_0);

	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_2);
	} else {
	    vis_src1(TMP2, REF_2);
	}

	vis_and(TMP4, MASK_7f, TMP4);

	vis_psub16(TMP6, TMP4, DST_0);	/* (a|b) - ((a^b)>>1) */
	vis_st64(DST_0, dest[0]);
	dest += stride;

	vis_xor(REF_0, REF_2, TMP12);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_or(REF_0, REF_2, TMP14);
	vis_mul8x16(CONST_128, TMP12, TMP12);

	vis_alignaddr_g0((void *)off);
	vis_faligndata(TMP8, TMP10, REF_0);
	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP8, TMP10, REF_2);
	} else {
	    vis_src1(TMP10, REF_2);
	}

	vis_and(TMP12, MASK_7f, TMP12);

	vis_psub16(TMP14, TMP12, DST_0);
	vis_st64(DST_0, dest[0]);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_2);
    } else {
	vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_or(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}
744
/* Horizontal half-pel average, 16 wide: combines the two horizontal
 * taps (weight 256 each via mul8x16au) with the existing dest (weight
 * 512 via mul8x16al), adds the CONST_3 rounding bias, and packs back
 * with the GSR scale factor of 5 -- presumably yielding
 * (2*dest + ref[i] + ref[i+1] + 3) >> 2 per byte; TODO confirm against
 * the fpack16 semantics.  One row per iteration; no epilogue needed. */
static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);	/* pack16 shift */

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do {	/* 26 cycles */
	vis_ld64(ref[0], TMP0);

	vis_ld64(ref[8], TMP2);

	vis_alignaddr_g0((void *)off);

	vis_ld64(ref[16], TMP4);

	vis_ld64(dest[0], DST_0);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64(dest[8], DST_2);
	vis_faligndata(TMP2, TMP4, REF_4);

	/* REF_2/REF_6 = source shifted one byte; src1 covers off == 7 */
	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_2);
	    vis_faligndata(TMP2, TMP4, REF_6);
	} else {
	    vis_src1(TMP2, REF_2);
	    vis_src1(TMP4, REF_6);
	}

	vis_mul8x16au(REF_0, CONST_256, TMP0);	/* widen bytes to 16 bit */

	vis_pmerge(ZERO, REF_2, TMP4);
	vis_mul8x16au(REF_0_1, CONST_256, TMP2);

	vis_pmerge(ZERO, REF_2_1, TMP6);

	vis_padd16(TMP0, TMP4, TMP0);

	vis_mul8x16al(DST_0, CONST_512, TMP4);	/* dest weighted x2 */
	vis_padd16(TMP2, TMP6, TMP2);

	vis_mul8x16al(DST_1, CONST_512, TMP6);

	vis_mul8x16au(REF_6, CONST_256, TMP12);

	vis_padd16(TMP0, TMP4, TMP0);
	vis_mul8x16au(REF_6_1, CONST_256, TMP14);

	vis_padd16(TMP2, TMP6, TMP2);
	vis_mul8x16au(REF_4, CONST_256, TMP16);

	vis_padd16(TMP0, CONST_3, TMP8);	/* rounding bias */
	vis_mul8x16au(REF_4_1, CONST_256, TMP18);

	vis_padd16(TMP2, CONST_3, TMP10);
	vis_pack16(TMP8, DST_0);

	vis_pack16(TMP10, DST_1);
	vis_padd16(TMP16, TMP12, TMP0);

	vis_st64(DST_0, dest[0]);
	vis_mul8x16al(DST_2, CONST_512, TMP4);
	vis_padd16(TMP18, TMP14, TMP2);

	vis_mul8x16al(DST_3, CONST_512, TMP6);
	vis_padd16(TMP0, CONST_3, TMP0);

	vis_padd16(TMP2, CONST_3, TMP2);

	vis_padd16(TMP0, TMP4, TMP0);

	vis_padd16(TMP2, TMP6, TMP2);
	vis_pack16(TMP0, DST_2);

	vis_pack16(TMP2, DST_3);
	vis_st64(DST_2, dest[8]);

	ref += stride;
	dest += stride;
    } while (--height);
}
835
/* Horizontal half-pel average, 8 wide, four rows per iteration
 * (height >>= 2; height assumed a multiple of 4 -- TODO confirm with
 * callers).  Same weighting as MC_avg_x_16_vis: two horizontal taps
 * at 256 each, dest at 512, +CONST_3 bias, packed with GSR scale 5.
 * TMP4/TMP5 double as DST_2/DST_3 for rows 3-4 (see inline comments,
 * kept from the original). */
static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_times_2 = stride << 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);	/* pack16 shift */

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    height >>= 2;
    do {	/* 47 cycles */
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, 8, TMP2);
	ref += stride;

	vis_alignaddr_g0((void *)off);

	vis_ld64(ref[0], TMP4);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, 8, TMP6);
	ref += stride;

	vis_ld64(ref[0], TMP8);

	vis_ld64_2(ref, 8, TMP10);
	ref += stride;
	vis_faligndata(TMP4, TMP6, REF_4);

	vis_ld64(ref[0], TMP12);

	vis_ld64_2(ref, 8, TMP14);
	ref += stride;
	vis_faligndata(TMP8, TMP10, REF_S0);

	vis_faligndata(TMP12, TMP14, REF_S4);

	/* one-byte-shifted copies of all four rows; src1 for off == 7 */
	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);

	    vis_ld64(dest[0], DST_0);
	    vis_faligndata(TMP0, TMP2, REF_2);

	    vis_ld64_2(dest, stride, DST_2);
	    vis_faligndata(TMP4, TMP6, REF_6);

	    vis_faligndata(TMP8, TMP10, REF_S2);

	    vis_faligndata(TMP12, TMP14, REF_S6);
	} else {
	    vis_ld64(dest[0], DST_0);
	    vis_src1(TMP2, REF_2);

	    vis_ld64_2(dest, stride, DST_2);
	    vis_src1(TMP6, REF_6);

	    vis_src1(TMP10, REF_S2);

	    vis_src1(TMP14, REF_S6);
	}

	/* rows 1-2: widen, weight, bias, pack */
	vis_pmerge(ZERO, REF_0, TMP0);
	vis_mul8x16au(REF_0_1, CONST_256, TMP2);

	vis_pmerge(ZERO, REF_2, TMP4);
	vis_mul8x16au(REF_2_1, CONST_256, TMP6);

	vis_padd16(TMP0, CONST_3, TMP0);
	vis_mul8x16al(DST_0, CONST_512, TMP16);

	vis_padd16(TMP2, CONST_3, TMP2);
	vis_mul8x16al(DST_1, CONST_512, TMP18);

	vis_padd16(TMP0, TMP4, TMP0);
	vis_mul8x16au(REF_4, CONST_256, TMP8);

	vis_padd16(TMP2, TMP6, TMP2);
	vis_mul8x16au(REF_4_1, CONST_256, TMP10);

	vis_padd16(TMP0, TMP16, TMP0);
	vis_mul8x16au(REF_6, CONST_256, TMP12);

	vis_padd16(TMP2, TMP18, TMP2);
	vis_mul8x16au(REF_6_1, CONST_256, TMP14);

	vis_padd16(TMP8, CONST_3, TMP8);
	vis_mul8x16al(DST_2, CONST_512, TMP16);

	vis_padd16(TMP8, TMP12, TMP8);
	vis_mul8x16al(DST_3, CONST_512, TMP18);

	vis_padd16(TMP10, TMP14, TMP10);
	vis_pack16(TMP0, DST_0);

	vis_pack16(TMP2, DST_1);
	vis_st64(DST_0, dest[0]);
	dest += stride;
	vis_padd16(TMP10, CONST_3, TMP10);

	vis_ld64_2(dest, stride, DST_0);
	vis_padd16(TMP8, TMP16, TMP8);

	vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
	vis_padd16(TMP10, TMP18, TMP10);
	vis_pack16(TMP8, DST_2);

	vis_pack16(TMP10, DST_3);
	vis_st64(DST_2, dest[0]);
	dest += stride;

	/* rows 3-4 (the REF_S* set) */
	vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
	vis_pmerge(ZERO, REF_S0, TMP0);

	vis_pmerge(ZERO, REF_S2, TMP24);
	vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

	vis_padd16(TMP0, CONST_3, TMP0);
	vis_mul8x16au(REF_S4, CONST_256, TMP8);

	vis_padd16(TMP2, CONST_3, TMP2);
	vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

	vis_padd16(TMP0, TMP24, TMP0);
	vis_mul8x16au(REF_S6, CONST_256, TMP12);

	vis_padd16(TMP2, TMP6, TMP2);
	vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

	vis_padd16(TMP8, CONST_3, TMP8);
	vis_mul8x16al(DST_0, CONST_512, TMP16);

	vis_padd16(TMP10, CONST_3, TMP10);
	vis_mul8x16al(DST_1, CONST_512, TMP18);

	vis_padd16(TMP8, TMP12, TMP8);
	vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

	vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
	vis_padd16(TMP0, TMP16, TMP0);

	vis_padd16(TMP2, TMP18, TMP2);
	vis_pack16(TMP0, DST_0);

	vis_padd16(TMP10, TMP14, TMP10);
	vis_pack16(TMP2, DST_1);
	vis_st64(DST_0, dest[0]);
	dest += stride;

	vis_padd16(TMP8, TMP20, TMP8);

	vis_padd16(TMP10, TMP22, TMP10);
	vis_pack16(TMP8, DST_2);

	vis_pack16(TMP10, DST_3);
	vis_st64(DST_2, dest[0]);
	dest += stride;
    } while (--height);
}
1001
/* Vertical half-pel put, 16 wide: dest row i = rounded byte average of
 * ref rows i and i+1, via the (a|b) - (((a^b)&0xfe)>>1) identity.
 * REF_0/REF_4 hold the current row, REF_2/REF_6 the next one; the loop
 * is pipelined two output rows per iteration with a two-row tail.
 * height assumed even and >= 4 -- TODO confirm with callers. */
static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int offset;

    ref = vis_alignaddr(ref);
    offset = (ref != _ref) ? 16 : 0;

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, offset, TMP4);
    ref += stride;

    vis_ld64(ref[0], TMP6);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64_2(ref, 8, TMP8);
    vis_faligndata(TMP2, TMP4, REF_4);

    vis_ld64_2(ref, offset, TMP10);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP6, TMP8, REF_2);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP8, TMP10, REF_6);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {	/* 24 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP12);

	vis_ld64_2(ref, 8, TMP2);
	vis_xor(REF_4, REF_6, TMP16);

	vis_ld64_2(ref, offset, TMP4);
	ref += stride;
	vis_or(REF_0, REF_2, TMP14);

	vis_ld64(ref[0], TMP6);
	vis_or(REF_4, REF_6, TMP18);

	vis_ld64_2(ref, 8, TMP8);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, offset, TMP10);
	ref += stride;
	vis_faligndata(TMP2, TMP4, REF_4);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_and(TMP16, MASK_fe, TMP16);
	vis_mul8x16(CONST_128, TMP12, TMP12);	/* (a^b)>>1 per byte */

	vis_mul8x16(CONST_128, TMP16, TMP16);
	vis_xor(REF_0, REF_2, TMP0);

	vis_xor(REF_4, REF_6, TMP2);

	vis_or(REF_0, REF_2, TMP20);

	vis_and(TMP12, MASK_7f, TMP12);

	vis_and(TMP16, MASK_7f, TMP16);

	vis_psub16(TMP14, TMP12, TMP12);	/* (a|b) - ((a^b)>>1) */
	vis_st64(TMP12, dest[0]);

	vis_psub16(TMP18, TMP16, TMP16);
	vis_st64_2(TMP16, dest, 8);
	dest += stride;

	vis_or(REF_4, REF_6, TMP18);

	vis_and(TMP0, MASK_fe, TMP0);

	vis_and(TMP2, MASK_fe, TMP2);
	vis_mul8x16(CONST_128, TMP0, TMP0);

	vis_faligndata(TMP6, TMP8, REF_2);
	vis_mul8x16(CONST_128, TMP2, TMP2);

	vis_faligndata(TMP8, TMP10, REF_6);

	vis_and(TMP0, MASK_7f, TMP0);

	vis_and(TMP2, MASK_7f, TMP2);

	vis_psub16(TMP20, TMP0, TMP0);
	vis_st64(TMP0, dest[0]);

	vis_psub16(TMP18, TMP2, TMP2);
	vis_st64_2(TMP2, dest, 8);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP12);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP16);

    vis_ld64_2(ref, offset, TMP4);
    vis_or(REF_0, REF_2, TMP14);

    vis_or(REF_4, REF_6, TMP18);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(TMP16, MASK_fe, TMP16);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_mul8x16(CONST_128, TMP16, TMP16);
    vis_xor(REF_0, REF_2, TMP0);

    vis_xor(REF_4, REF_6, TMP2);

    vis_or(REF_0, REF_2, TMP20);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_and(TMP16, MASK_7f, TMP16);

    vis_psub16(TMP14, TMP12, TMP12);
    vis_st64(TMP12, dest[0]);

    vis_psub16(TMP18, TMP16, TMP16);
    vis_st64_2(TMP16, dest, 8);
    dest += stride;

    vis_or(REF_4, REF_6, TMP18);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP2, MASK_fe, TMP2);
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_mul8x16(CONST_128, TMP2, TMP2);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_and(TMP2, MASK_7f, TMP2);

    vis_psub16(TMP20, TMP0, TMP0);
    vis_st64(TMP0, dest[0]);

    vis_psub16(TMP18, TMP2, TMP2);
    vis_st64_2(TMP2, dest, 8);
}
1160
/* Vertical half-pel put, 8 wide: dest row i = rounded byte average of
 * ref rows i and i+1.  REF_0 is the current row, REF_2 the next; the
 * two registers swap roles each half-iteration.  Pipelined two output
 * rows per iteration with a two-row tail; height assumed even and
 * >= 4 -- TODO confirm with callers. */
static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int offset;

    ref = vis_alignaddr(ref);
    offset = (ref != _ref) ? 8 : 0;

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, offset, TMP2);
    ref += stride;

    vis_ld64(ref[0], TMP4);

    vis_ld64_2(ref, offset, TMP6);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP4, TMP6, REF_2);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do {	/* 12 cycles */
	vis_ld64(ref[0], TMP0);
	vis_xor(REF_0, REF_2, TMP4);

	vis_ld64_2(ref, offset, TMP2);
	ref += stride;
	vis_and(TMP4, MASK_fe, TMP4);

	vis_or(REF_0, REF_2, TMP6);
	vis_mul8x16(CONST_128, TMP4, TMP4);	/* (a^b)>>1 per byte */

	vis_faligndata(TMP0, TMP2, REF_0);
	vis_ld64(ref[0], TMP0);

	vis_ld64_2(ref, offset, TMP2);
	ref += stride;
	vis_xor(REF_0, REF_2, TMP12);

	vis_and(TMP4, MASK_7f, TMP4);

	vis_and(TMP12, MASK_fe, TMP12);

	vis_mul8x16(CONST_128, TMP12, TMP12);
	vis_or(REF_0, REF_2, TMP14);

	vis_psub16(TMP6, TMP4, DST_0);	/* (a|b) - ((a^b)>>1) */
	vis_st64(DST_0, dest[0]);
	dest += stride;

	vis_faligndata(TMP0, TMP2, REF_2);

	vis_and(TMP12, MASK_7f, TMP12);

	vis_psub16(TMP14, TMP12, DST_0);
	vis_st64(DST_0, dest[0]);
	dest += stride;
    } while (--height);

    /* pipeline tail: final two rows */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, offset, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_mul8x16(CONST_128, TMP12, TMP12);
    vis_or(REF_0, REF_2, TMP14);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
}
1255
/* 16-pixel-wide "avg" with vertical half-pel interpolation:
 * for each output pixel, dest = (ref[x] + ref[x+stride] + 2*dest + 3) >> 2,
 * i.e. the rounded average of dest with the vertically interpolated
 * prediction.  The +3 rounding bias is CONST_3, dest is scaled by two via
 * mul8x16al with CONST_512, the 8->16 bit expansion of the reference rows
 * uses pmerge/mul8x16au with CONST_256, and vis_pack16 (under the GSR
 * scale factor programmed below) clamps and narrows the 16-bit sums back
 * to bytes — presumably dividing out the fixed-point scaling; confirm
 * against the VIS fpack16 definition.
 * Two output rows are produced per loop iteration (height >>= 1).
 */
static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;
    int stride_16;
    int offset;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    /* alignaddr rounds ref down to an 8-byte boundary and latches the
     * shift amount used by every subsequent faligndata. */
    ref = vis_alignaddr(ref);
    /* If the source was unaligned we must fetch one extra 8-byte word
     * (at offset 16) to assemble the second half of the row; when it was
     * already aligned, offset 0 avoids reading past the 16-byte row. */
    offset = (ref != _ref) ? 16 : 0;

    /* Prime REF_2/REF_6 with the first (top) source row. */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64_2(ref, offset, TMP4);
    stride_16 = stride + offset;

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_6);
    height >>= 1;

    do {	/* 31 cycles */
	/* Expand the previous row (REF_2/REF_6) to 16 bits while the
	 * next two source rows and the two dest rows are being loaded. */
	vis_ld64_2(ref, stride, TMP0);
	vis_pmerge(ZERO, REF_2, TMP12);
	vis_mul8x16au(REF_2_1, CONST_256, TMP14);

	vis_ld64_2(ref, stride_8, TMP2);
	vis_pmerge(ZERO, REF_6, TMP16);
	vis_mul8x16au(REF_6_1, CONST_256, TMP18);

	vis_ld64_2(ref, stride_16, TMP4);
	ref += stride;

	vis_ld64(dest[0], DST_0);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(dest, 8, DST_2);
	vis_faligndata(TMP2, TMP4, REF_4);

	vis_ld64_2(ref, stride, TMP6);
	vis_pmerge(ZERO, REF_0, TMP0);
	vis_mul8x16au(REF_0_1, CONST_256, TMP2);

	vis_ld64_2(ref, stride_8, TMP8);
	vis_pmerge(ZERO, REF_4, TMP4);

	vis_ld64_2(ref, stride_16, TMP10);
	ref += stride;

	/* REF_S0/REF_S2 are reused here as scratch for the second pair
	 * of dest words (see the DST_4/DST_6 aliases in the comments). */
	vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
	vis_faligndata(TMP6, TMP8, REF_2);
	vis_mul8x16au(REF_4_1, CONST_256, TMP6);

	vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
	vis_faligndata(TMP8, TMP10, REF_6);
	vis_mul8x16al(DST_0, CONST_512, TMP20);

	/* Add the +3 rounding bias to the expanded second row. */
	vis_padd16(TMP0, CONST_3, TMP0);
	vis_mul8x16al(DST_1, CONST_512, TMP22);

	vis_padd16(TMP2, CONST_3, TMP2);
	vis_mul8x16al(DST_2, CONST_512, TMP24);

	vis_padd16(TMP4, CONST_3, TMP4);
	vis_mul8x16al(DST_3, CONST_512, TMP26);

	vis_padd16(TMP6, CONST_3, TMP6);

	/* Accumulate 2*dest into the first row's sums. */
	vis_padd16(TMP12, TMP20, TMP12);
	vis_mul8x16al(REF_S0, CONST_512, TMP20);

	vis_padd16(TMP14, TMP22, TMP14);
	vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

	vis_padd16(TMP16, TMP24, TMP16);
	vis_mul8x16al(REF_S2, CONST_512, TMP24);

	vis_padd16(TMP18, TMP26, TMP18);
	vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

	/* row0 + row1 (+3 + 2*dest already folded in) for output row 0,
	 * while expanding the third source row for output row 1. */
	vis_padd16(TMP12, TMP0, TMP12);
	vis_mul8x16au(REF_2, CONST_256, TMP28);

	vis_padd16(TMP14, TMP2, TMP14);
	vis_mul8x16au(REF_2_1, CONST_256, TMP30);

	vis_padd16(TMP16, TMP4, TMP16);
	vis_mul8x16au(REF_6, CONST_256, REF_S4);

	vis_padd16(TMP18, TMP6, TMP18);
	vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

	vis_pack16(TMP12, DST_0);
	vis_padd16(TMP28, TMP0, TMP12);

	vis_pack16(TMP14, DST_1);
	vis_st64(DST_0, dest[0]);
	vis_padd16(TMP30, TMP2, TMP14);

	vis_pack16(TMP16, DST_2);
	vis_padd16(REF_S4, TMP4, TMP16);

	vis_pack16(TMP18, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;
	vis_padd16(REF_S6, TMP6, TMP18);

	/* Second output row: fold in its 2*dest terms, pack and store. */
	vis_padd16(TMP12, TMP20, TMP12);

	vis_padd16(TMP14, TMP22, TMP14);
	vis_pack16(TMP12, DST_0);

	vis_padd16(TMP16, TMP24, TMP16);
	vis_pack16(TMP14, DST_1);
	vis_st64(DST_0, dest[0]);

	vis_padd16(TMP18, TMP26, TMP18);
	vis_pack16(TMP16, DST_2);

	vis_pack16(TMP18, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;
    } while (--height);
}
1387
/* 8-pixel-wide "avg" with vertical half-pel interpolation:
 * dest = (ref[x] + ref[x+stride] + 2*dest + 3) >> 2 per pixel
 * (rounded average of dest with the vertically interpolated prediction).
 * Same arithmetic scheme as MC_avg_y_16_vis, operating on a single
 * 8-byte word per row; two output rows per loop iteration.
 */
static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
			    const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8;
    int offset;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    /* Round ref down to 8-byte alignment; faligndata will apply the
     * latched byte shift.  An extra word (offset 8) is only fetched
     * when the source was actually misaligned. */
    ref = vis_alignaddr(ref);
    offset = (ref != _ref) ? 8 : 0;

    /* Prime REF_2 with the first (top) source row. */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, offset, TMP2);
    stride_8 = stride + offset;

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);

    height >>= 1;
    do {	/* 20 cycles */
	/* Expand the previous row to 16 bits while loading the next
	 * source rows and the two destination rows. */
	vis_ld64_2(ref, stride, TMP0);
	vis_pmerge(ZERO, REF_2, TMP8);
	vis_mul8x16au(REF_2_1, CONST_256, TMP10);

	vis_ld64_2(ref, stride_8, TMP2);
	ref += stride;

	vis_ld64(dest[0], DST_0);

	vis_ld64_2(dest, stride, DST_2);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, stride, TMP4);
	vis_mul8x16al(DST_0, CONST_512, TMP16);	/* 2*dest, row 0 */
	vis_pmerge(ZERO, REF_0, TMP12);

	vis_ld64_2(ref, stride_8, TMP6);
	ref += stride;
	vis_mul8x16al(DST_1, CONST_512, TMP18);
	vis_pmerge(ZERO, REF_0_1, TMP14);

	/* +3 rounding bias on the middle row's expanded pixels. */
	vis_padd16(TMP12, CONST_3, TMP12);
	vis_mul8x16al(DST_2, CONST_512, TMP24);	/* 2*dest, row 1 */

	vis_padd16(TMP14, CONST_3, TMP14);
	vis_mul8x16al(DST_3, CONST_512, TMP26);

	vis_faligndata(TMP4, TMP6, REF_2);

	/* row0 + row1 (+3 folded in) ... */
	vis_padd16(TMP8, TMP12, TMP8);

	vis_padd16(TMP10, TMP14, TMP10);
	vis_mul8x16au(REF_2, CONST_256, TMP20);

	/* ... + 2*dest, then pack16 narrows back to clamped bytes. */
	vis_padd16(TMP8, TMP16, TMP0);
	vis_mul8x16au(REF_2_1, CONST_256, TMP22);

	vis_padd16(TMP10, TMP18, TMP2);
	vis_pack16(TMP0, DST_0);

	vis_pack16(TMP2, DST_1);
	vis_st64(DST_0, dest[0]);
	dest += stride;
	vis_padd16(TMP12, TMP20, TMP12);

	/* Second output row: middle row + bottom row + 2*dest. */
	vis_padd16(TMP14, TMP22, TMP14);

	vis_padd16(TMP12, TMP24, TMP0);

	vis_padd16(TMP14, TMP26, TMP2);
	vis_pack16(TMP0, DST_2);

	vis_pack16(TMP2, DST_3);
	vis_st64(DST_2, dest[0]);
	dest += stride;
    } while (--height);
}
1470
/* 16-pixel-wide "put" with diagonal (x+y) half-pel interpolation:
 * dest = (a + b + c + d + 2) >> 2, where a..d are the four reference
 * pixels surrounding the half-pel position (x,y), (x+1,y), (x,y+1),
 * (x+1,y+1).  The +2 rounding bias is CONST_2; bytes are expanded to
 * 16 bits via mul8x16au/pmerge with CONST_256 and vis_pack16 (with the
 * GSR scale factor set below) narrows the sums back to clamped bytes.
 * The horizontally shifted pixel pair is produced by re-aligning the
 * same 64-bit loads at off+1 instead of off.
 * Two output rows are produced per loop iteration.
 */
static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
			      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime REF_S0/REF_S4 (row at offset `off`) and REF_S2/REF_S6
     * (same row shifted one byte, i.e. offset off+1). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants2[0], CONST_2);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_S2);
	vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
	/* off+1 == 8 cannot be expressed as an alignaddr shift; a shift
	 * of 8 simply selects the second source word, which is what
	 * vis_src1 does directly. */
	vis_src1(TMP2, REF_S2);
	vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {
	/* Expand the previous row's four byte groups to 16 bits while
	 * loading the next two source rows. */
	vis_ld64_2(ref, stride, TMP0);
	vis_mul8x16au(REF_S0, CONST_256, TMP12);
	vis_pmerge(ZERO, REF_S0_1, TMP14);

	/* Restore the original alignment shift for the unshifted pixels. */
	vis_alignaddr_g0((void *)off);

	vis_ld64_2(ref, stride_8, TMP2);
	vis_mul8x16au(REF_S2, CONST_256, TMP16);
	vis_pmerge(ZERO, REF_S2_1, TMP18);

	vis_ld64_2(ref, stride_16, TMP4);
	ref += stride;
	vis_mul8x16au(REF_S4, CONST_256, TMP20);
	vis_pmerge(ZERO, REF_S4_1, TMP22);

	vis_ld64_2(ref, stride, TMP6);
	vis_mul8x16au(REF_S6, CONST_256, TMP24);
	vis_pmerge(ZERO, REF_S6_1, TMP26);

	vis_ld64_2(ref, stride_8, TMP8);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, stride_16, TMP10);
	ref += stride;
	vis_faligndata(TMP2, TMP4, REF_4);

	vis_faligndata(TMP6, TMP8, REF_S0);

	vis_faligndata(TMP8, TMP10, REF_S4);

	/* Same off/off+1 re-alignment for the two freshly loaded rows. */
	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_2);
	    vis_faligndata(TMP2, TMP4, REF_6);
	    vis_faligndata(TMP6, TMP8, REF_S2);
	    vis_faligndata(TMP8, TMP10, REF_S6);
	} else {
	    vis_src1(TMP2, REF_2);
	    vis_src1(TMP4, REF_6);
	    vis_src1(TMP8, REF_S2);
	    vis_src1(TMP10, REF_S6);
	}

	/* First output row: sum previous row (TMP12..) + current row
	 * (plain and shifted) + 2, then pack back to bytes. */
	vis_mul8x16au(REF_0, CONST_256, TMP0);
	vis_pmerge(ZERO, REF_0_1, TMP2);

	vis_mul8x16au(REF_2, CONST_256, TMP4);
	vis_pmerge(ZERO, REF_2_1, TMP6);

	vis_padd16(TMP0, CONST_2, TMP8);
	vis_mul8x16au(REF_4, CONST_256, TMP0);

	vis_padd16(TMP2, CONST_2, TMP10);
	vis_mul8x16au(REF_4_1, CONST_256, TMP2);

	vis_padd16(TMP8, TMP4, TMP8);
	vis_mul8x16au(REF_6, CONST_256, TMP4);

	vis_padd16(TMP10, TMP6, TMP10);
	vis_mul8x16au(REF_6_1, CONST_256, TMP6);

	vis_padd16(TMP12, TMP8, TMP12);

	vis_padd16(TMP14, TMP10, TMP14);

	vis_padd16(TMP12, TMP16, TMP12);

	vis_padd16(TMP14, TMP18, TMP14);
	vis_pack16(TMP12, DST_0);

	vis_pack16(TMP14, DST_1);
	vis_st64(DST_0, dest[0]);
	vis_padd16(TMP0, CONST_2, TMP12);

	vis_mul8x16au(REF_S0, CONST_256, TMP0);
	vis_padd16(TMP2, CONST_2, TMP14);

	vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
	vis_padd16(TMP12, TMP4, TMP12);

	vis_mul8x16au(REF_S2, CONST_256, TMP4);
	vis_padd16(TMP14, TMP6, TMP14);

	vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
	vis_padd16(TMP20, TMP12, TMP20);

	vis_padd16(TMP22, TMP14, TMP22);

	vis_padd16(TMP20, TMP24, TMP20);

	vis_padd16(TMP22, TMP26, TMP22);
	vis_pack16(TMP20, DST_2);

	vis_pack16(TMP22, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;
	vis_padd16(TMP0, TMP4, TMP24);

	/* Second output row, using the row expanded above plus the
	 * next-row pixels already in REF_S*. */
	vis_mul8x16au(REF_S4, CONST_256, TMP0);
	vis_padd16(TMP2, TMP6, TMP26);

	vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
	vis_padd16(TMP24, TMP8, TMP24);

	vis_padd16(TMP26, TMP10, TMP26);
	vis_pack16(TMP24, DST_0);

	vis_pack16(TMP26, DST_1);
	vis_st64(DST_0, dest[0]);
	vis_pmerge(ZERO, REF_S6, TMP4);

	vis_pmerge(ZERO, REF_S6_1, TMP6);

	vis_padd16(TMP0, TMP4, TMP0);

	vis_padd16(TMP2, TMP6, TMP2);

	vis_padd16(TMP0, TMP12, TMP0);

	vis_padd16(TMP2, TMP14, TMP2);
	vis_pack16(TMP0, DST_2);

	vis_pack16(TMP2, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;
    } while (--height);
}
1635
/* 8-pixel-wide "put" with diagonal (x+y) half-pel interpolation:
 * dest = (a + b + c + d + 2) >> 2 per pixel, where a..d are the four
 * neighbouring reference pixels.  Same scheme as MC_put_xy_16_vis but
 * on a single 8-byte word per row; the horizontally shifted pixels are
 * obtained by re-aligning the same loads at off+1.
 * Two output rows are produced per loop iteration.
 */
static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime REF_S0 (row at offset `off`) and REF_S2 (offset off+1). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants2[0], CONST_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
	/* Shift of 8 is not encodable in alignaddr; selecting the
	 * second source word (vis_src1) is equivalent. */
	vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do {	/* 26 cycles */
	/* Expand the previous row's plain and shifted pixels while the
	 * next two source rows are being loaded. */
	vis_ld64_2(ref, stride, TMP0);
	vis_mul8x16au(REF_S0, CONST_256, TMP8);
	vis_pmerge(ZERO, REF_S2, TMP12);

	/* Restore the row alignment shift before the faligndatas below. */
	vis_alignaddr_g0((void *)off);

	vis_ld64_2(ref, stride_8, TMP2);
	ref += stride;
	vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
	vis_pmerge(ZERO, REF_S2_1, TMP14);

	vis_ld64_2(ref, stride, TMP4);

	vis_ld64_2(ref, stride_8, TMP6);
	ref += stride;
	vis_faligndata(TMP0, TMP2, REF_S4);

	vis_pmerge(ZERO, REF_S4, TMP18);

	vis_pmerge(ZERO, REF_S4_1, TMP20);

	vis_faligndata(TMP4, TMP6, REF_S0);

	/* off/off+1 re-alignment for the freshly loaded rows. */
	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_S6);
	    vis_faligndata(TMP4, TMP6, REF_S2);
	} else {
	    vis_src1(TMP2, REF_S6);
	    vis_src1(TMP6, REF_S2);
	}

	/* Middle row + its shifted copy + the +2 rounding bias; this
	 * partial sum is shared by both output rows. */
	vis_padd16(TMP18, CONST_2, TMP18);
	vis_mul8x16au(REF_S6, CONST_256, TMP22);

	vis_padd16(TMP20, CONST_2, TMP20);
	vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

	vis_mul8x16au(REF_S0, CONST_256, TMP26);
	vis_pmerge(ZERO, REF_S0_1, TMP28);

	vis_mul8x16au(REF_S2, CONST_256, TMP30);
	vis_padd16(TMP18, TMP22, TMP18);

	vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
	vis_padd16(TMP20, TMP24, TMP20);

	/* First output row: top row (+shifted) + shared middle sum. */
	vis_padd16(TMP8, TMP18, TMP8);

	vis_padd16(TMP10, TMP20, TMP10);

	vis_padd16(TMP8, TMP12, TMP8);

	vis_padd16(TMP10, TMP14, TMP10);
	vis_pack16(TMP8, DST_0);

	vis_pack16(TMP10, DST_1);
	vis_st64(DST_0, dest[0]);
	dest += stride;
	vis_padd16(TMP18, TMP26, TMP18);

	/* Second output row: shared middle sum + bottom row (+shifted). */
	vis_padd16(TMP20, TMP28, TMP20);

	vis_padd16(TMP18, TMP30, TMP18);

	vis_padd16(TMP20, TMP32, TMP20);
	vis_pack16(TMP18, DST_2);

	vis_pack16(TMP20, DST_3);
	vis_st64(DST_2, dest[0]);
	dest += stride;
    } while (--height);
}
1740
/* 16-pixel-wide "avg" with diagonal (x+y) half-pel interpolation:
 * dest = (a + b + c + d + 4*dest + 6) >> 3 per pixel, i.e. the rounded
 * average of dest with the four-tap interpolated prediction
 * (a+b+c+d+2)>>2.  The +6 rounding bias is CONST_6 and dest is scaled
 * by four via mul8x16al with CONST_1024; note the GSR scale factor here
 * is 4 (not 5 as in the put/two-tap variants) to account for the extra
 * shift.  Two output rows are produced per loop iteration.
 */
static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
			      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime REF_S0/REF_S4 (row at offset `off`) and REF_S2/REF_S6
     * (same row re-aligned at off+1, i.e. shifted one pixel right). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_S2);
	vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
	/* Shift of 8 is not encodable in alignaddr; vis_src1 selects
	 * the second source word, which is equivalent. */
	vis_src1(TMP2, REF_S2);
	vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {	/* 55 cycles */
	/* Expand the previous row (plain + shifted) to 16 bits while
	 * the next two source rows are loaded. */
	vis_ld64_2(ref, stride, TMP0);
	vis_mul8x16au(REF_S0, CONST_256, TMP12);
	vis_pmerge(ZERO, REF_S0_1, TMP14);

	vis_alignaddr_g0((void *)off);

	vis_ld64_2(ref, stride_8, TMP2);
	vis_mul8x16au(REF_S2, CONST_256, TMP16);
	vis_pmerge(ZERO, REF_S2_1, TMP18);

	vis_ld64_2(ref, stride_16, TMP4);
	ref += stride;
	vis_mul8x16au(REF_S4, CONST_256, TMP20);
	vis_pmerge(ZERO, REF_S4_1, TMP22);

	vis_ld64_2(ref, stride, TMP6);
	vis_mul8x16au(REF_S6, CONST_256, TMP24);
	vis_pmerge(ZERO, REF_S6_1, TMP26);

	vis_ld64_2(ref, stride_8, TMP8);
	vis_faligndata(TMP0, TMP2, REF_0);

	vis_ld64_2(ref, stride_16, TMP10);
	ref += stride;
	vis_faligndata(TMP2, TMP4, REF_4);

	vis_ld64(dest[0], DST_0);
	vis_faligndata(TMP6, TMP8, REF_S0);

	vis_ld64_2(dest, 8, DST_2);
	vis_faligndata(TMP8, TMP10, REF_S4);

	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_2);
	    vis_faligndata(TMP2, TMP4, REF_6);
	    vis_faligndata(TMP6, TMP8, REF_S2);
	    vis_faligndata(TMP8, TMP10, REF_S6);
	} else {
	    vis_src1(TMP2, REF_2);
	    vis_src1(TMP4, REF_6);
	    vis_src1(TMP8, REF_S2);
	    vis_src1(TMP10, REF_S6);
	}

	/* 4*dest for the first output row; REF_0/REF_2 are recycled as
	 * scratch once their pixel data has been consumed. */
	vis_mul8x16al(DST_0, CONST_1024, TMP30);
	vis_pmerge(ZERO, REF_0, TMP0);

	vis_mul8x16al(DST_1, CONST_1024, TMP32);
	vis_pmerge(ZERO, REF_0_1, TMP2);

	vis_mul8x16au(REF_2, CONST_256, TMP4);
	vis_pmerge(ZERO, REF_2_1, TMP6);

	vis_mul8x16al(DST_2, CONST_1024, REF_0);
	vis_padd16(TMP0, CONST_6, TMP0);

	vis_mul8x16al(DST_3, CONST_1024, REF_2);
	vis_padd16(TMP2, CONST_6, TMP2);

	/* Middle row + shifted middle row (+6 bias folded in). */
	vis_padd16(TMP0, TMP4, TMP0);
	vis_mul8x16au(REF_4, CONST_256, TMP4);

	vis_padd16(TMP2, TMP6, TMP2);
	vis_mul8x16au(REF_4_1, CONST_256, TMP6);

	vis_padd16(TMP12, TMP0, TMP12);
	vis_mul8x16au(REF_6, CONST_256, TMP8);

	vis_padd16(TMP14, TMP2, TMP14);
	vis_mul8x16au(REF_6_1, CONST_256, TMP10);

	vis_padd16(TMP12, TMP16, TMP12);
	vis_mul8x16au(REF_S0, CONST_256, REF_4);

	vis_padd16(TMP14, TMP18, TMP14);
	vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

	/* + 4*dest, then pack and store the left half of row 0. */
	vis_padd16(TMP12, TMP30, TMP12);

	vis_padd16(TMP14, TMP32, TMP14);
	vis_pack16(TMP12, DST_0);

	vis_pack16(TMP14, DST_1);
	vis_st64(DST_0, dest[0]);
	vis_padd16(TMP4, CONST_6, TMP4);

	vis_ld64_2(dest, stride, DST_0);
	vis_padd16(TMP6, CONST_6, TMP6);
	vis_mul8x16au(REF_S2, CONST_256, TMP12);

	vis_padd16(TMP4, TMP8, TMP4);
	vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

	vis_padd16(TMP6, TMP10, TMP6);

	vis_padd16(TMP20, TMP4, TMP20);

	vis_padd16(TMP22, TMP6, TMP22);

	vis_padd16(TMP20, TMP24, TMP20);

	vis_padd16(TMP22, TMP26, TMP22);

	/* Right half of row 0: + 4*dest (held in REF_0/REF_2 scratch). */
	vis_padd16(TMP20, REF_0, TMP20);
	vis_mul8x16au(REF_S4, CONST_256, REF_0);

	vis_padd16(TMP22, REF_2, TMP22);
	vis_pack16(TMP20, DST_2);

	vis_pack16(TMP22, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;

	/* Second output row: same pattern against the next dest row. */
	vis_ld64_2(dest, 8, DST_2);
	vis_mul8x16al(DST_0, CONST_1024, TMP30);
	vis_pmerge(ZERO, REF_S4_1, REF_2);

	vis_mul8x16al(DST_1, CONST_1024, TMP32);
	vis_padd16(REF_4, TMP0, TMP8);

	vis_mul8x16au(REF_S6, CONST_256, REF_4);
	vis_padd16(REF_6, TMP2, TMP10);

	vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
	vis_padd16(TMP8, TMP12, TMP8);

	vis_padd16(TMP10, TMP14, TMP10);

	vis_padd16(TMP8, TMP30, TMP8);

	vis_padd16(TMP10, TMP32, TMP10);
	vis_pack16(TMP8, DST_0);

	vis_pack16(TMP10, DST_1);
	vis_st64(DST_0, dest[0]);

	vis_padd16(REF_0, TMP4, REF_0);

	vis_mul8x16al(DST_2, CONST_1024, TMP30);
	vis_padd16(REF_2, TMP6, REF_2);

	vis_mul8x16al(DST_3, CONST_1024, TMP32);
	vis_padd16(REF_0, REF_4, REF_0);

	vis_padd16(REF_2, REF_6, REF_2);

	vis_padd16(REF_0, TMP30, REF_0);

	/* stall */

	vis_padd16(REF_2, TMP32, REF_2);
	vis_pack16(REF_0, DST_2);

	vis_pack16(REF_2, DST_3);
	vis_st64_2(DST_2, dest, 8);
	dest += stride;
    } while (--height);
}
1937
/* 8-pixel-wide "avg" with diagonal (x+y) half-pel interpolation:
 * dest = (a + b + c + d + 4*dest + 6) >> 3 per pixel — the rounded
 * average of dest with the four-tap prediction (a+b+c+d+2)>>2.
 * Same arithmetic as MC_avg_xy_16_vis (CONST_6 bias, 4*dest via
 * CONST_1024, GSR scale factor 4) on a single 8-byte word per row.
 * Two output rows are produced per loop iteration.
 */
static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
			     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime REF_S0 (row at offset `off`) and REF_S2 (offset off+1). */
    vis_ld64(ref[0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64(constants6[0], CONST_6);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
	vis_alignaddr_g0((void *)off_plus_1);
	vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
	/* Shift of 8 is not encodable in alignaddr; vis_src1 selects
	 * the second source word, which is equivalent. */
	vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do {	/* 31 cycles */
	/* Expand the previous row (plain + shifted) while loading the
	 * next two source rows and the two destination rows. */
	vis_ld64_2(ref, stride, TMP0);
	vis_mul8x16au(REF_S0, CONST_256, TMP8);
	vis_pmerge(ZERO, REF_S0_1, TMP10);

	vis_ld64_2(ref, stride_8, TMP2);
	ref += stride;
	vis_mul8x16au(REF_S2, CONST_256, TMP12);
	vis_pmerge(ZERO, REF_S2_1, TMP14);

	vis_alignaddr_g0((void *)off);

	vis_ld64_2(ref, stride, TMP4);
	vis_faligndata(TMP0, TMP2, REF_S4);

	vis_ld64_2(ref, stride_8, TMP6);
	ref += stride;

	vis_ld64(dest[0], DST_0);
	vis_faligndata(TMP4, TMP6, REF_S0);

	vis_ld64_2(dest, stride, DST_2);

	if (off != 0x7) {
	    vis_alignaddr_g0((void *)off_plus_1);
	    vis_faligndata(TMP0, TMP2, REF_S6);
	    vis_faligndata(TMP4, TMP6, REF_S2);
	} else {
	    vis_src1(TMP2, REF_S6);
	    vis_src1(TMP6, REF_S2);
	}

	/* 4*dest for row 0 (TMP30/TMP32) and the shared middle-row sum
	 * (TMP22/TMP24, +6 bias folded in). */
	vis_mul8x16al(DST_0, CONST_1024, TMP30);
	vis_pmerge(ZERO, REF_S4, TMP22);

	vis_mul8x16al(DST_1, CONST_1024, TMP32);
	vis_pmerge(ZERO, REF_S4_1, TMP24);

	vis_mul8x16au(REF_S6, CONST_256, TMP26);
	vis_pmerge(ZERO, REF_S6_1, TMP28);

	vis_mul8x16au(REF_S0, CONST_256, REF_S4);
	vis_padd16(TMP22, CONST_6, TMP22);

	vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
	vis_padd16(TMP24, CONST_6, TMP24);

	vis_mul8x16al(DST_2, CONST_1024, REF_0);	/* 4*dest, row 1 */
	vis_padd16(TMP22, TMP26, TMP22);

	vis_mul8x16al(DST_3, CONST_1024, REF_2);
	vis_padd16(TMP24, TMP28, TMP24);

	vis_mul8x16au(REF_S2, CONST_256, TMP26);
	vis_padd16(TMP8, TMP22, TMP8);

	vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
	vis_padd16(TMP10, TMP24, TMP10);

	/* Row 0: top row (+shifted) + middle sum + 4*dest, pack, store. */
	vis_padd16(TMP8, TMP12, TMP8);

	vis_padd16(TMP10, TMP14, TMP10);

	vis_padd16(TMP8, TMP30, TMP8);

	vis_padd16(TMP10, TMP32, TMP10);
	vis_pack16(TMP8, DST_0);

	vis_pack16(TMP10, DST_1);
	vis_st64(DST_0, dest[0]);
	dest += stride;

	/* Row 1: middle sum + bottom row (+shifted) + 4*dest. */
	vis_padd16(REF_S4, TMP22, TMP12);

	vis_padd16(REF_S6, TMP24, TMP14);

	vis_padd16(TMP12, TMP26, TMP12);

	vis_padd16(TMP14, TMP28, TMP14);

	vis_padd16(TMP12, REF_0, TMP12);

	vis_padd16(TMP14, REF_2, TMP14);
	vis_pack16(TMP12, DST_2);

	vis_pack16(TMP14, DST_3);
	vis_st64(DST_2, dest[0]);
	dest += stride;
    } while (--height);
}
2058
/* Instantiate the exported motion-compensation dispatch entry for the
 * VIS-accelerated variants defined above — presumably expands to the
 * mpeg2_mc_vis function table; see MPEG2_MC_EXTERN in mpeg2_internal.h. */
MPEG2_MC_EXTERN(vis);
2060
2061 #endif /* defined(ARCH_SPARC) && defined(ENABLE_VIS) */
2062