1 /*
2  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 #include "vpx_dsp/ppc/types_vsx.h"
17 
subtract_block4x4(int16_t * diff,ptrdiff_t diff_stride,const uint8_t * src,ptrdiff_t src_stride,const uint8_t * pred,ptrdiff_t pred_stride)18 static VPX_FORCE_INLINE void subtract_block4x4(
19     int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src,
20     ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) {
21   int16_t *diff1 = diff + 2 * diff_stride;
22   const uint8_t *src1 = src + 2 * src_stride;
23   const uint8_t *pred1 = pred + 2 * pred_stride;
24 
25   const int16x8_t d0 = vec_vsx_ld(0, diff);
26   const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride);
27   const int16x8_t d2 = vec_vsx_ld(0, diff1);
28   const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride);
29 
30   const uint8x16_t s0 = read4x2(src, (int)src_stride);
31   const uint8x16_t p0 = read4x2(pred, (int)pred_stride);
32   const uint8x16_t s1 = read4x2(src1, (int)src_stride);
33   const uint8x16_t p1 = read4x2(pred1, (int)pred_stride);
34 
35   const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
36   const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
37 
38   vec_vsx_st(xxpermdi(da, d0, 1), 0, diff);
39   vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride);
40   vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1);
41   vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride);
42 }
43 
vpx_subtract_block_vsx(int rows,int cols,int16_t * diff,ptrdiff_t diff_stride,const uint8_t * src,ptrdiff_t src_stride,const uint8_t * pred,ptrdiff_t pred_stride)44 void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff,
45                             ptrdiff_t diff_stride, const uint8_t *src,
46                             ptrdiff_t src_stride, const uint8_t *pred,
47                             ptrdiff_t pred_stride) {
48   int r = rows, c;
49 
50   switch (cols) {
51     case 64:
52     case 32:
53       do {
54         for (c = 0; c < cols; c += 32) {
55           const uint8x16_t s0 = vec_vsx_ld(0, src + c);
56           const uint8x16_t s1 = vec_vsx_ld(16, src + c);
57           const uint8x16_t p0 = vec_vsx_ld(0, pred + c);
58           const uint8x16_t p1 = vec_vsx_ld(16, pred + c);
59           const int16x8_t d0l =
60               vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
61           const int16x8_t d0h =
62               vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
63           const int16x8_t d1l =
64               vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1));
65           const int16x8_t d1h =
66               vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1));
67           vec_vsx_st(d0h, 0, diff + c);
68           vec_vsx_st(d0l, 16, diff + c);
69           vec_vsx_st(d1h, 0, diff + c + 16);
70           vec_vsx_st(d1l, 16, diff + c + 16);
71         }
72         diff += diff_stride;
73         pred += pred_stride;
74         src += src_stride;
75       } while (--r);
76       break;
77     case 16:
78       do {
79         const uint8x16_t s0 = vec_vsx_ld(0, src);
80         const uint8x16_t p0 = vec_vsx_ld(0, pred);
81         const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0));
82         const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
83         vec_vsx_st(d0h, 0, diff);
84         vec_vsx_st(d0l, 16, diff);
85         diff += diff_stride;
86         pred += pred_stride;
87         src += src_stride;
88       } while (--r);
89       break;
90     case 8:
91       do {
92         const uint8x16_t s0 = vec_vsx_ld(0, src);
93         const uint8x16_t p0 = vec_vsx_ld(0, pred);
94         const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0));
95         vec_vsx_st(d0h, 0, diff);
96         diff += diff_stride;
97         pred += pred_stride;
98         src += src_stride;
99       } while (--r);
100       break;
101     case 4:
102       subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride);
103       if (r > 4) {
104         diff += 4 * diff_stride;
105         pred += 4 * pred_stride;
106         src += 4 * src_stride;
107 
108         subtract_block4x4(diff, diff_stride,
109 
110                           src, src_stride,
111 
112                           pred, pred_stride);
113       }
114       break;
115     default: assert(0);  // unreachable
116   }
117 }
118