/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
  int distortion;

  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
  const int16x8_t a1 =
      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
  const int16x8_t b1 =
      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
  const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);

  vec_ste(d, 0, &distortion);

  return distortion;
}
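
/* Illustrative scalar sketch of the routine above (the helper name is
 * hypothetical and the function is unused by the VSX path): the vector code
 * computes the sum of squared differences over a 4x4 block, with vec_msum
 * accumulating per-lane products and vec_sums folding the partial sums into
 * a single 32-bit value. */
static INLINE uint32_t get4x4sse_cs_scalar_sketch(const uint8_t *src_ptr,
                                                  int src_stride,
                                                  const uint8_t *ref_ptr,
                                                  int ref_stride) {
  int r, c;
  uint32_t sse = 0;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int diff =
          src_ptr[r * src_stride + c] - ref_ptr[r * ref_stride + c];
      sse += (uint32_t)(diff * diff);
    }
  }
  return sse;
}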

// TODO(lu_zero): Unroll
uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
  unsigned int i, sum = 0;
  int32x4_t s = vec_splat_s32(0);

  for (i = 0; i < 256; i += 8) {
    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
    s = vec_msum(v, v, s);
  }

  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)s, 0, &sum);

  return sum;
}
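
/* Illustrative scalar sketch of the loop above (hypothetical helper, not
 * used by the VSX path): the input is a 16x16 macroblock stored as 256
 * contiguous int16 values, and the result is their plain sum of squares.
 * Accumulation is kept in 32 bits to mirror the vector code. */
static INLINE uint32_t get_mb_ss_scalar_sketch(const int16_t *src_ptr) {
  unsigned int i;
  uint32_t sum = 0;
  for (i = 0; i < 256; ++i) {
    sum += (uint32_t)(src_ptr[i] * src_ptr[i]);
  }
  return sum;
}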

void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
                           int height, const uint8_t *ref, int ref_stride) {
  int i, j;
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width >= 16) {
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 16) {
        const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
        vec_vsx_st(v, j, comp_pred);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else if (width == 8) {
    // Process 2 lines at a time.
    for (i = 0; i < height / 2; ++i) {
      const uint8x16_t r0 = vec_vsx_ld(0, ref);
      const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
      const uint8x16_t r = xxpermdi(r0, r1, 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 2;
      pred += 16;       // width * 2;
      ref += ref_stride * 2;
    }
  } else {
    assert(width == 4);
    // Process 4 lines at a time.
    for (i = 0; i < height / 4; ++i) {
      const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
      const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
      const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
      const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
      const uint8x16_t r =
          (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 4;
      pred += 16;       // width * 4;
      ref += ref_stride * 4;
    }
  }
}
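
/* Scalar sketch of the averaging above, for reference only (the helper name
 * is hypothetical and the function is unused by the VSX path): vec_avg()
 * computes a per-byte rounding average, so every output pixel is
 * (pred + ref + 1) >> 1. */
static INLINE void comp_avg_pred_scalar_sketch(uint8_t *comp_pred,
                                               const uint8_t *pred, int width,
                                               int height, const uint8_t *ref,
                                               int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      /* Round up on ties, matching vec_avg(). */
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}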

/* Accumulate, for one 32-pixel span of src_ptr/ref_ptr, the sum of the pixel
 * differences into *sum and the sum of the squared differences into
 * *sum_squared. */
static INLINE void variance_inner_32(const uint8_t *src_ptr,
                                     const uint8_t *ref_ptr,
                                     int32x4_t *sum_squared, int32x4_t *sum) {
  int32x4_t s = *sum;
  int32x4_t ss = *sum_squared;

  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);

  const int16x8_t a0 = unpack_to_s16_h(va0);
  const int16x8_t b0 = unpack_to_s16_h(vb0);
  const int16x8_t a1 = unpack_to_s16_l(va0);
  const int16x8_t b1 = unpack_to_s16_l(vb0);
  const int16x8_t a2 = unpack_to_s16_h(va1);
  const int16x8_t b2 = unpack_to_s16_h(vb1);
  const int16x8_t a3 = unpack_to_s16_l(va1);
  const int16x8_t b3 = unpack_to_s16_l(vb1);
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int16x8_t d2 = vec_sub(a2, b2);
  const int16x8_t d3 = vec_sub(a3, b3);

  s = vec_sum4s(d0, s);
  ss = vec_msum(d0, d0, ss);
  s = vec_sum4s(d1, s);
  ss = vec_msum(d1, d1, ss);
  s = vec_sum4s(d2, s);
  ss = vec_msum(d2, d2, ss);
  s = vec_sum4s(d3, s);
  ss = vec_msum(d3, d3, ss);
  *sum = s;
  *sum_squared = ss;
}

/* Compute the sum of pixel differences (*sum) and the sum of squared
 * differences (*sse) between a w x h block of src_ptr and ref_ptr. */
static INLINE void variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr, int ref_stride, int w,
                            int h, uint32_t *sse, int *sum) {
  int i;

  int32x4_t s = vec_splat_s32(0);
  int32x4_t ss = vec_splat_s32(0);

  switch (w) {
    case 4:
      for (i = 0; i < h / 2; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
        const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
        const int16x8_t d = vec_sub(a0, b0);
        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride * 2;
        ref_ptr += ref_stride * 2;
      }
      break;
    case 8:
      for (i = 0; i < h; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
        const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
        const int16x8_t d = vec_sub(a0, b0);

        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 16:
      for (i = 0; i < h; ++i) {
        const uint8x16_t va = vec_vsx_ld(0, src_ptr);
        const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
        const int16x8_t a0 = unpack_to_s16_h(va);
        const int16x8_t b0 = unpack_to_s16_h(vb);
        const int16x8_t a1 = unpack_to_s16_l(va);
        const int16x8_t b1 = unpack_to_s16_l(vb);
        const int16x8_t d0 = vec_sub(a0, b0);
        const int16x8_t d1 = vec_sub(a1, b1);

        s = vec_sum4s(d0, s);
        ss = vec_msum(d0, d0, ss);
        s = vec_sum4s(d1, s);
        ss = vec_msum(d1, d1, ss);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 32:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 64:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
  }

  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste(s, 0, sum);

  ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)ss, 0, sse);
}
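
/* Scalar sketch equivalent to the vector helper above (hypothetical name,
 * unused by the VSX path), showing the quantities the SIMD code accumulates:
 * the plain sum of differences and the sum of squared differences. */
static INLINE void variance_scalar_sketch(const uint8_t *src_ptr,
                                          int src_stride,
                                          const uint8_t *ref_ptr,
                                          int ref_stride, int w, int h,
                                          uint32_t *sse, int *sum) {
  int i, j;
  *sum = 0;
  *sse = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src_ptr[j] - ref_ptr[j];
      *sum += diff;
      *sse += (uint32_t)(diff * diff);
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
}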

/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H)                                                    \
  void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \
                                 const uint8_t *ref_ptr, int ref_stride, \
                                 uint32_t *sse, int *sum) {              \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum);  \
  }

/* Identical to the variance call except it does not calculate the
 * sse - sum^2 / (w * h) and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H)                                                         \
  uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                  const uint8_t *ref_ptr, int ref_stride, \
                                  uint32_t *sse) {                        \
    int sum;                                                              \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);  \
    return *sse;                                                          \
  }

#define VAR(W, H)                                                              \
  uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                       const uint8_t *ref_ptr, int ref_stride, \
                                       uint32_t *sse) {                        \
    int sum;                                                                   \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H)));              \
  }
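
/* For reference, VARIANCES(16, 16) below expands (via VAR) to
 * vpx_variance16x16_vsx(), which is typically used as in the illustrative
 * sketch below; the buffer names here are placeholders, not library symbols:
 *
 *   uint32_t sse;
 *   const uint32_t var =
 *       vpx_variance16x16_vsx(src, src_stride, ref, ref_stride, &sse);
 *
 * The return value is sse - sum^2 / (16 * 16), i.e. the variance of the
 * 16x16 block.
 */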

#define VARIANCES(W, H) VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)