/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

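// Sum of squared differences (SSE) of a 4x4 block, processing two rows per
// vector.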
uint32_t vpx_get4x4sse_cs_vsx(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *ref_ptr, int ref_stride) {
  int distortion;

  const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
  const int16x8_t a1 =
      unpack_to_s16_h(read4x2(src_ptr + src_stride * 2, src_stride));
  const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
  const int16x8_t b1 =
      unpack_to_s16_h(read4x2(ref_ptr + ref_stride * 2, ref_stride));
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int32x4_t ds = vec_msum(d1, d1, vec_msum(d0, d0, vec_splat_s32(0)));
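  // vec_sums() folds the four 32-bit partial sums into element 3; splatting
  // lane 3 across the vector makes vec_ste() store the total regardless of
  // which element the store selects.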
  const int32x4_t d = vec_splat(vec_sums(ds, vec_splat_s32(0)), 3);

  vec_ste(d, 0, &distortion);

  return distortion;
}

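// Sum of squares of one 16x16 macroblock of residuals: 256 int16 values,
// eight lanes per vec_msum() step.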
// TODO(lu_zero): Unroll
uint32_t vpx_get_mb_ss_vsx(const int16_t *src_ptr) {
  unsigned int i, sum = 0;
  int32x4_t s = vec_splat_s32(0);

  for (i = 0; i < 256; i += 8) {
    const int16x8_t v = vec_vsx_ld(0, src_ptr + i);
    s = vec_msum(v, v, s);
  }

  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)s, 0, &sum);

  return sum;
}

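// Average prediction: each output byte is the rounding average of pred and
// ref, computed by vec_avg() as (a + b + 1) >> 1 on unsigned bytes.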
void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
                           int height, const uint8_t *ref, int ref_stride) {
  int i, j;
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width >= 16) {
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; j += 16) {
        const uint8x16_t v = vec_avg(vec_vsx_ld(j, pred), vec_vsx_ld(j, ref));
        vec_vsx_st(v, j, comp_pred);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else if (width == 8) {
    // Process 2 lines at a time.
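    // xxpermdi(r0, r1, 0) packs the 8 valid bytes of each row load into one
    // 16-byte vector, so a single vec_avg() covers both rows.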
    for (i = 0; i < height / 2; ++i) {
      const uint8x16_t r0 = vec_vsx_ld(0, ref);
      const uint8x16_t r1 = vec_vsx_ld(0, ref + ref_stride);
      const uint8x16_t r = xxpermdi(r0, r1, 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 2;
      pred += 16;       // width * 2;
      ref += ref_stride * 2;
    }
  } else {
    assert(width == 4);
    // Process 4 lines at a time.
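    // vec_mergeh() interleaves the first word (one 4-pixel row) of each pair
    // of loads; xxpermdi() then packs all four rows into one 16-byte vector.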
    for (i = 0; i < height / 4; ++i) {
      const uint32x4_t r0 = (uint32x4_t)vec_vsx_ld(0, ref);
      const uint32x4_t r1 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride);
      const uint32x4_t r2 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 2);
      const uint32x4_t r3 = (uint32x4_t)vec_vsx_ld(0, ref + ref_stride * 3);
      const uint8x16_t r =
          (uint8x16_t)xxpermdi(vec_mergeh(r0, r1), vec_mergeh(r2, r3), 0);
      const uint8x16_t v = vec_avg(vec_vsx_ld(0, pred), r);
      vec_vsx_st(v, 0, comp_pred);
      comp_pred += 16;  // width * 4;
      pred += 16;       // width * 4;
      ref += ref_stride * 4;
    }
  }
}

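// Accumulate the sum and the sum of squares of the differences over 32
// pixels of one row into the running vector accumulators.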
static INLINE void variance_inner_32(const uint8_t *src_ptr,
                                     const uint8_t *ref_ptr,
                                     int32x4_t *sum_squared, int32x4_t *sum) {
  int32x4_t s = *sum;
  int32x4_t ss = *sum_squared;

  const uint8x16_t va0 = vec_vsx_ld(0, src_ptr);
  const uint8x16_t vb0 = vec_vsx_ld(0, ref_ptr);
  const uint8x16_t va1 = vec_vsx_ld(16, src_ptr);
  const uint8x16_t vb1 = vec_vsx_ld(16, ref_ptr);

  const int16x8_t a0 = unpack_to_s16_h(va0);
  const int16x8_t b0 = unpack_to_s16_h(vb0);
  const int16x8_t a1 = unpack_to_s16_l(va0);
  const int16x8_t b1 = unpack_to_s16_l(vb0);
  const int16x8_t a2 = unpack_to_s16_h(va1);
  const int16x8_t b2 = unpack_to_s16_h(vb1);
  const int16x8_t a3 = unpack_to_s16_l(va1);
  const int16x8_t b3 = unpack_to_s16_l(vb1);
  const int16x8_t d0 = vec_sub(a0, b0);
  const int16x8_t d1 = vec_sub(a1, b1);
  const int16x8_t d2 = vec_sub(a2, b2);
  const int16x8_t d3 = vec_sub(a3, b3);

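  // vec_sum4s() adds each adjacent pair of 16-bit differences into a 32-bit
  // lane of s; vec_msum() accumulates the squared differences into ss.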
  s = vec_sum4s(d0, s);
  ss = vec_msum(d0, d0, ss);
  s = vec_sum4s(d1, s);
  ss = vec_msum(d1, d1, ss);
  s = vec_sum4s(d2, s);
  ss = vec_msum(d2, d2, ss);
  s = vec_sum4s(d3, s);
  ss = vec_msum(d3, d3, ss);
  *sum = s;
  *sum_squared = ss;
}

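// Compute the per-block sum of differences and sum of squared differences;
// pixels are widened to 16 bits before subtraction so the difference cannot
// wrap.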
static INLINE void variance(const uint8_t *src_ptr, int src_stride,
                            const uint8_t *ref_ptr, int ref_stride, int w,
                            int h, uint32_t *sse, int *sum) {
  int i;

  int32x4_t s = vec_splat_s32(0);
  int32x4_t ss = vec_splat_s32(0);

  switch (w) {
    case 4:
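      // read4x2() packs 4 pixels from each of two rows into one vector, so
      // every iteration covers two rows (hence h / 2).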
      for (i = 0; i < h / 2; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(read4x2(src_ptr, src_stride));
        const int16x8_t b0 = unpack_to_s16_h(read4x2(ref_ptr, ref_stride));
        const int16x8_t d = vec_sub(a0, b0);
        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride * 2;
        ref_ptr += ref_stride * 2;
      }
      break;
    case 8:
      for (i = 0; i < h; ++i) {
        const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, src_ptr));
        const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, ref_ptr));
        const int16x8_t d = vec_sub(a0, b0);

        s = vec_sum4s(d, s);
        ss = vec_msum(d, d, ss);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 16:
      for (i = 0; i < h; ++i) {
        const uint8x16_t va = vec_vsx_ld(0, src_ptr);
        const uint8x16_t vb = vec_vsx_ld(0, ref_ptr);
        const int16x8_t a0 = unpack_to_s16_h(va);
        const int16x8_t b0 = unpack_to_s16_h(vb);
        const int16x8_t a1 = unpack_to_s16_l(va);
        const int16x8_t b1 = unpack_to_s16_l(vb);
        const int16x8_t d0 = vec_sub(a0, b0);
        const int16x8_t d1 = vec_sub(a1, b1);

        s = vec_sum4s(d0, s);
        ss = vec_msum(d0, d0, ss);
        s = vec_sum4s(d1, s);
        ss = vec_msum(d1, d1, ss);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 32:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
    case 64:
      for (i = 0; i < h; ++i) {
        variance_inner_32(src_ptr, ref_ptr, &ss, &s);
        variance_inner_32(src_ptr + 32, ref_ptr + 32, &ss, &s);

        src_ptr += src_stride;
        ref_ptr += ref_stride;
      }
      break;
  }

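  // Fold each accumulator across its four lanes and store the scalars,
  // using the same vec_sums/vec_splat/vec_ste idiom as above.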
  s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);

  vec_ste(s, 0, sum);

  ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);

  vec_ste((uint32x4_t)ss, 0, sse);
}

/* Identical to the variance call except it takes an additional parameter, sum,
 * and returns that value using pass-by-reference instead of returning
 * sse - sum^2 / (w * h).
 */
#define GET_VAR(W, H)                                                    \
  void vpx_get##W##x##H##var_vsx(const uint8_t *src_ptr, int src_stride, \
                                 const uint8_t *ref_ptr, int ref_stride, \
                                 uint32_t *sse, int *sum) {              \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, sum);  \
  }
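// For example, GET_VAR(16, 16) defines vpx_get16x16var_vsx(), which reports
// the raw sse and sum of a 16x16 block and leaves the variance to the caller.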

/* Identical to the variance call except it does not calculate
 * sse - sum^2 / (w * h) and returns sse in addition to modifying the passed-in
 * variable.
 */
#define MSE(W, H)                                                         \
  uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                  const uint8_t *ref_ptr, int ref_stride, \
                                  uint32_t *sse) {                        \
    int sum;                                                              \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);  \
    return *sse;                                                          \
  }

#define VAR(W, H)                                                              \
  uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *src_ptr, int src_stride, \
                                       const uint8_t *ref_ptr, int ref_stride, \
                                       uint32_t *sse) {                        \
    int sum;                                                                   \
    variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, &sum);       \
    return *sse - (uint32_t)(((int64_t)sum * sum) / ((W) * (H)));              \
  }
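// variance = sse - sum^2 / (W * H). The sum is widened to int64_t before
// squaring: for a 64x64 block |sum| can reach 255 * 4096, whose square does
// not fit in 32 bits.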

#define VARIANCES(W, H) VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)

GET_VAR(16, 16)
GET_VAR(8, 8)

MSE(16, 16)
MSE(16, 8)
MSE(8, 16)
MSE(8, 8)