/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/vpx_filter.h"

// TODO(lu_zero): unroll
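// The copy_w* helpers move one row of 16/32/64 pixels per iteration.
// vec_vsx_ld/vec_vsx_st are VSX loads/stores that, unlike the older AltiVec
// vec_ld/vec_st, do not require 16-byte alignment, so src and dst may sit at
// any byte offset.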
static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}

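// vec_avg computes the lane-wise rounding average (a + b + 1) >> 1, the same
// rounding as the scalar ROUND_POWER_OF_TWO(a + b, 1) used by the C
// reference, so the avg_w* helpers stay bit-exact with vpx_convolve_avg_c.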
static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
}

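// Computes a single filtered output pixel. vec_msum multiplies the eight
// 16-bit samples in s by the eight coefficients in f and accumulates pairs
// into four 32-bit lanes; vec_sums then adds those lanes together with the
// rounding bias 1 << (FILTER_BITS - 1), and the total is shifted right by
// FILTER_BITS. vec_packsu clamps the result to [0, 255] with unsigned
// saturation, and vec_splat broadcasts that byte to all 16 lanes so that
// vec_ste, which stores the single lane selected by the low bits of the
// destination address, writes the correct byte for any alignment of dst.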
static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s,
                                           const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}

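// Filters one pixel horizontally. Note that vec_vsx_ld fetches a full 16
// bytes even though only SUBPEL_TAPS (8) of them are used, so it may read up
// to 8 bytes past the last tap; unpack_to_s16_h widens those first 8 bytes to
// 16 bits.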
static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst,
                                             const uint8_t *const src_x,
                                             const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);

  convolve_line(dst, s, f);
}

// TODO(lu_zero): Implement 8x8 and bigger block special cases
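// x_q4 is a subpel position in Q4 fixed point: x_q4 >> SUBPEL_BITS is the
// integer source column and x_q4 & SUBPEL_MASK selects one of the 16
// phase-specific 8-tap kernels. Stepping by x_step_q4 handles scaled as well
// as unscaled convolution.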
static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride,
                                            const InterpKernel *x_filters,
                                            int x0_q4, int x_step_q4, int w,
                                            int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

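// Same as convolve_horiz, but the filtered pixel is rounding-averaged with
// the value already in dst, as the avg (compound prediction) variants
// require.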
static VPX_FORCE_INLINE void convolve_avg_horiz(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}

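// Interleaves eight row vectors so the returned vector holds the first byte
// of each of the eight inputs in its first eight lanes (the second bytes land
// in the remaining lanes). This turns eight strided vertical taps into one
// contiguous 8-lane run that convolve_line can consume.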
static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}

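// Filters one pixel vertically: load the eight rows that contribute to the
// output pixel, transpose them so the vertical taps become contiguous, then
// reuse the single-pixel convolution kernel.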
static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst,
                                             const uint8_t *const src_y,
                                             ptrdiff_t src_stride,
                                             const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  convolve_line(dst, unpack_to_s16_h(s), f);
}

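// Unlike convolve_horiz, the loops here are column-major (outer x, inner y)
// so the eight-row loads in convolve_line_v walk down a single column per
// inner iteration.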
static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src,
                                           ptrdiff_t src_stride, uint8_t *dst,
                                           ptrdiff_t dst_stride,
                                           const InterpKernel *y_filters,
                                           int y0_q4, int y_step_q4, int w,
                                           int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static VPX_FORCE_INLINE void convolve_avg_vert(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *const filter,
                                      int x0_q4, int x_step_q4, int y0_q4,
                                      int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

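// Public RTCD entry points. All variants share the full convolve prototype;
// the phase/step arguments a given variant does not use are cast to void to
// silence unused-parameter warnings.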
void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
           y_step_q4, w, h);
}

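// Two-pass averaging filter: run the full 2-D filter into a temp block, then
// rounding-average temp into dst. The copy/average kernels do not use the
// filter arguments, so NULL/0 placeholders are passed to vpx_convolve_avg_vsx.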
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}