/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include <assert.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/ppc/types_vsx.h"
#include "vpx_dsp/vpx_filter.h"
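// Block copy helpers: each copy_wN() copies an N-pixel-wide block one row at
// a time using unaligned VSX loads/stores.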
// TODO(lu_zero): unroll
static VPX_FORCE_INLINE void copy_w16(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w32(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void copy_w64(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      int32_t h) {
  int i;

  for (i = h; i--;) {
    vec_vsx_st(vec_vsx_ld(0, src), 0, dst);
    vec_vsx_st(vec_vsx_ld(16, src), 16, dst);
    vec_vsx_st(vec_vsx_ld(32, src), 32, dst);
    vec_vsx_st(vec_vsx_ld(48, src), 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  switch (w) {
    case 16: {
      copy_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      copy_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      copy_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      int i;
      for (i = h; i--;) {
        memcpy(dst, src, w);
        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
  }
}
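// Averaging helpers: each avg_wN() replaces dst with the rounded average
// ((src + dst + 1) >> 1) of the two blocks, N pixels per row, via vec_avg.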
static VPX_FORCE_INLINE void avg_w16(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    vec_vsx_st(v, 0, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w32(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

static VPX_FORCE_INLINE void avg_w64(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     int32_t h) {
  int i;

  for (i = h; i--;) {
    const uint8x16_t v0 = vec_avg(vec_vsx_ld(0, src), vec_vsx_ld(0, dst));
    const uint8x16_t v1 = vec_avg(vec_vsx_ld(16, src), vec_vsx_ld(16, dst));
    const uint8x16_t v2 = vec_avg(vec_vsx_ld(32, src), vec_vsx_ld(32, dst));
    const uint8x16_t v3 = vec_avg(vec_vsx_ld(48, src), vec_vsx_ld(48, dst));
    vec_vsx_st(v0, 0, dst);
    vec_vsx_st(v1, 16, dst);
    vec_vsx_st(v2, 32, dst);
    vec_vsx_st(v3, 48, dst);
    src += src_stride;
    dst += dst_stride;
  }
}

void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const InterpKernel *filter, int x0_q4, int x_step_q4,
                          int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) {
  switch (w) {
    case 16: {
      avg_w16(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 32: {
      avg_w32(src, src_stride, dst, dst_stride, h);
      break;
    }
    case 64: {
      avg_w64(src, src_stride, dst, dst_stride, h);
      break;
    }
    default: {
      vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
      break;
    }
  }
}
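// Apply an 8-tap filter to the 8 source samples in s: vec_msum forms four
// partial sums of products, vec_sums adds them together with the rounding
// bias (1 << (FILTER_BITS - 1)), the total is shifted down by FILTER_BITS,
// packed with unsigned saturation to 8 bits, and the lane holding the result
// is splatted so vec_ste can store the single output byte to dst.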
static VPX_FORCE_INLINE void convolve_line(uint8_t *dst, const int16x8_t s,
                                           const int16x8_t f) {
  const int32x4_t sum = vec_msum(s, f, vec_splat_s32(0));
  const int32x4_t bias =
      vec_sl(vec_splat_s32(1), vec_splat_u32(FILTER_BITS - 1));
  const int32x4_t avg = vec_sr(vec_sums(sum, bias), vec_splat_u32(FILTER_BITS));
  const uint8x16_t v = vec_splat(
      vec_packsu(vec_pack(avg, vec_splat_s32(0)), vec_splat_s16(0)), 3);
  vec_ste(v, 0, dst);
}
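// Filter one output pixel horizontally: the 8 taps are contiguous bytes at
// src_x, so a single unaligned load provides all of them.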
static VPX_FORCE_INLINE void convolve_line_h(uint8_t *dst,
                                             const uint8_t *const src_x,
                                             const int16_t *const x_filter) {
  const int16x8_t s = unpack_to_s16_h(vec_vsx_ld(0, src_x));
  const int16x8_t f = vec_vsx_ld(0, x_filter);

  convolve_line(dst, s, f);
}

// TODO(lu_zero): Implement 8x8 and bigger block special cases
static VPX_FORCE_INLINE void convolve_horiz(const uint8_t *src,
                                            ptrdiff_t src_stride, uint8_t *dst,
                                            ptrdiff_t dst_stride,
                                            const InterpKernel *x_filters,
                                            int x0_q4, int x_step_q4, int w,
                                            int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      convolve_line_h(dst + x, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
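// Same as convolve_horiz(), but the filtered value is averaged into dst
// instead of overwriting it.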
static VPX_FORCE_INLINE void convolve_avg_horiz(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
    int x_step_q4, int w, int h) {
  int x, y;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      uint8_t v;
      convolve_line_h(&v, &src[x_q4 >> SUBPEL_BITS],
                      x_filters[x_q4 & SUBPEL_MASK]);
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + v, 1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
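// Interleave the first bytes of the 8 input rows into the low half of the
// result, turning a column of 8 pixels into 8 contiguous bytes.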
static uint8x16_t transpose_line_u8_8x8(uint8x16_t a, uint8x16_t b,
                                        uint8x16_t c, uint8x16_t d,
                                        uint8x16_t e, uint8x16_t f,
                                        uint8x16_t g, uint8x16_t h) {
  uint16x8_t ab = (uint16x8_t)vec_mergeh(a, b);
  uint16x8_t cd = (uint16x8_t)vec_mergeh(c, d);
  uint16x8_t ef = (uint16x8_t)vec_mergeh(e, f);
  uint16x8_t gh = (uint16x8_t)vec_mergeh(g, h);

  uint32x4_t abcd = (uint32x4_t)vec_mergeh(ab, cd);
  uint32x4_t efgh = (uint32x4_t)vec_mergeh(ef, gh);

  return (uint8x16_t)vec_mergeh(abcd, efgh);
}
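// Filter one output pixel vertically: load the 8 source rows involved,
// transpose so that the column of taps becomes contiguous, then reuse
// convolve_line().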
static VPX_FORCE_INLINE void convolve_line_v(uint8_t *dst,
                                             const uint8_t *const src_y,
                                             ptrdiff_t src_stride,
                                             const int16_t *const y_filter) {
  uint8x16_t s0 = vec_vsx_ld(0, src_y + 0 * src_stride);
  uint8x16_t s1 = vec_vsx_ld(0, src_y + 1 * src_stride);
  uint8x16_t s2 = vec_vsx_ld(0, src_y + 2 * src_stride);
  uint8x16_t s3 = vec_vsx_ld(0, src_y + 3 * src_stride);
  uint8x16_t s4 = vec_vsx_ld(0, src_y + 4 * src_stride);
  uint8x16_t s5 = vec_vsx_ld(0, src_y + 5 * src_stride);
  uint8x16_t s6 = vec_vsx_ld(0, src_y + 6 * src_stride);
  uint8x16_t s7 = vec_vsx_ld(0, src_y + 7 * src_stride);
  const int16x8_t f = vec_vsx_ld(0, y_filter);
  const uint8x16_t s = transpose_line_u8_8x8(s0, s1, s2, s3, s4, s5, s6, s7);

  convolve_line(dst, unpack_to_s16_h(s), f);
}
static VPX_FORCE_INLINE void convolve_vert(const uint8_t *src,
                                           ptrdiff_t src_stride, uint8_t *dst,
                                           ptrdiff_t dst_stride,
                                           const InterpKernel *y_filters,
                                           int y0_q4, int y_step_q4, int w,
                                           int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      convolve_line_v(dst + y * dst_stride,
                      &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
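// Same as convolve_vert(), but the filtered value is averaged into dst
// instead of overwriting it.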
static VPX_FORCE_INLINE void convolve_avg_vert(
    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
    int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      uint8_t v;
      convolve_line_v(&v, &src[(y_q4 >> SUBPEL_BITS) * src_stride], src_stride,
                      y_filters[y_q4 & SUBPEL_MASK]);
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] + v, 1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}

static VPX_FORCE_INLINE void convolve(const uint8_t *src, ptrdiff_t src_stride,
                                      uint8_t *dst, ptrdiff_t dst_stride,
                                      const InterpKernel *const filter,
                                      int x0_q4, int x_step_q4, int y0_q4,
                                      int y_step_q4, int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 135]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32);
  assert(x_step_q4 <= 32);

  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
                 filter, x0_q4, x_step_q4, w, intermediate_height);
  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
                y0_q4, y_step_q4, w, h);
}

void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w,
                 h);
}

void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride,
                                 const InterpKernel *filter, int x0_q4,
                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
                                 int h) {
  (void)y0_q4;
  (void)y_step_q4;

  convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                     w, h);
}

void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w,
                h);
}

void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *filter, int x0_q4,
                                int x_step_q4, int y0_q4, int y_step_q4, int w,
                                int h) {
  (void)x0_q4;
  (void)x_step_q4;

  convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4,
                    w, h);
}

void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                       ptrdiff_t dst_stride, const InterpKernel *filter,
                       int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                       int w, int h) {
  convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4,
           y_step_q4, w, h);
}
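// Filter into a temporary buffer first, then average the result into dst.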
void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4, int x_step_q4,
                           int y0_q4, int y_step_q4, int w, int h) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
                    y_step_q4, w, h);
  vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}