/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"

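// V prediction: copy the row of pixels directly above the block into every
// row of the block.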
void vpx_v_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(d, 0, dst);
  }
}

void vpx_v_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, above);
  const uint8x16_t d1 = vec_vsx_ld(16, above);
  int i;
  (void)left;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(d0, 0, dst);
    vec_vsx_st(d1, 16, dst);
  }
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
static const uint32x4_t mask4 = { 0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };

void vpx_h_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  (void)above;

  vec_vsx_st(vec_sel(v0, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v1, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v2, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
  dst += stride;
  vec_vsx_st(vec_sel(v3, vec_vsx_ld(0, dst), (uint8x16_t)mask4), 0, dst);
}

void vpx_h_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  (void)above;

  vec_vsx_st(xxpermdi(v0, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v1, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v2, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v3, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v4, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v5, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v6, vec_vsx_ld(0, dst), 1), 0, dst);
  dst += stride;
  vec_vsx_st(xxpermdi(v7, vec_vsx_ld(0, dst), 1), 0, dst);
}
#endif

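// H prediction: fill each row of the block with the corresponding left-column
// pixel, splatted across the full row width.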
void vpx_h_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vec_vsx_ld(0, left);
  const uint8x16_t v0 = vec_splat(d, 0);
  const uint8x16_t v1 = vec_splat(d, 1);
  const uint8x16_t v2 = vec_splat(d, 2);
  const uint8x16_t v3 = vec_splat(d, 3);

  const uint8x16_t v4 = vec_splat(d, 4);
  const uint8x16_t v5 = vec_splat(d, 5);
  const uint8x16_t v6 = vec_splat(d, 6);
  const uint8x16_t v7 = vec_splat(d, 7);

  const uint8x16_t v8 = vec_splat(d, 8);
  const uint8x16_t v9 = vec_splat(d, 9);
  const uint8x16_t v10 = vec_splat(d, 10);
  const uint8x16_t v11 = vec_splat(d, 11);

  const uint8x16_t v12 = vec_splat(d, 12);
  const uint8x16_t v13 = vec_splat(d, 13);
  const uint8x16_t v14 = vec_splat(d, 14);
  const uint8x16_t v15 = vec_splat(d, 15);

  (void)above;

  vec_vsx_st(v0, 0, dst);
  dst += stride;
  vec_vsx_st(v1, 0, dst);
  dst += stride;
  vec_vsx_st(v2, 0, dst);
  dst += stride;
  vec_vsx_st(v3, 0, dst);
  dst += stride;
  vec_vsx_st(v4, 0, dst);
  dst += stride;
  vec_vsx_st(v5, 0, dst);
  dst += stride;
  vec_vsx_st(v6, 0, dst);
  dst += stride;
  vec_vsx_st(v7, 0, dst);
  dst += stride;
  vec_vsx_st(v8, 0, dst);
  dst += stride;
  vec_vsx_st(v9, 0, dst);
  dst += stride;
  vec_vsx_st(v10, 0, dst);
  dst += stride;
  vec_vsx_st(v11, 0, dst);
  dst += stride;
  vec_vsx_st(v12, 0, dst);
  dst += stride;
  vec_vsx_st(v13, 0, dst);
  dst += stride;
  vec_vsx_st(v14, 0, dst);
  dst += stride;
  vec_vsx_st(v15, 0, dst);
}

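// Store one 32-pixel row of the splatted left pixel v, then advance dst by one
// row.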
#define H_PREDICTOR_32(v) \
  vec_vsx_st(v, 0, dst);  \
  vec_vsx_st(v, 16, dst); \
  dst += stride

void vpx_h_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vec_vsx_ld(0, left);
  const uint8x16_t d1 = vec_vsx_ld(16, left);

  const uint8x16_t v0_0 = vec_splat(d0, 0);
  const uint8x16_t v1_0 = vec_splat(d0, 1);
  const uint8x16_t v2_0 = vec_splat(d0, 2);
  const uint8x16_t v3_0 = vec_splat(d0, 3);
  const uint8x16_t v4_0 = vec_splat(d0, 4);
  const uint8x16_t v5_0 = vec_splat(d0, 5);
  const uint8x16_t v6_0 = vec_splat(d0, 6);
  const uint8x16_t v7_0 = vec_splat(d0, 7);
  const uint8x16_t v8_0 = vec_splat(d0, 8);
  const uint8x16_t v9_0 = vec_splat(d0, 9);
  const uint8x16_t v10_0 = vec_splat(d0, 10);
  const uint8x16_t v11_0 = vec_splat(d0, 11);
  const uint8x16_t v12_0 = vec_splat(d0, 12);
  const uint8x16_t v13_0 = vec_splat(d0, 13);
  const uint8x16_t v14_0 = vec_splat(d0, 14);
  const uint8x16_t v15_0 = vec_splat(d0, 15);

  const uint8x16_t v0_1 = vec_splat(d1, 0);
  const uint8x16_t v1_1 = vec_splat(d1, 1);
  const uint8x16_t v2_1 = vec_splat(d1, 2);
  const uint8x16_t v3_1 = vec_splat(d1, 3);
  const uint8x16_t v4_1 = vec_splat(d1, 4);
  const uint8x16_t v5_1 = vec_splat(d1, 5);
  const uint8x16_t v6_1 = vec_splat(d1, 6);
  const uint8x16_t v7_1 = vec_splat(d1, 7);
  const uint8x16_t v8_1 = vec_splat(d1, 8);
  const uint8x16_t v9_1 = vec_splat(d1, 9);
  const uint8x16_t v10_1 = vec_splat(d1, 10);
  const uint8x16_t v11_1 = vec_splat(d1, 11);
  const uint8x16_t v12_1 = vec_splat(d1, 12);
  const uint8x16_t v13_1 = vec_splat(d1, 13);
  const uint8x16_t v14_1 = vec_splat(d1, 14);
  const uint8x16_t v15_1 = vec_splat(d1, 15);

  (void)above;

  H_PREDICTOR_32(v0_0);
  H_PREDICTOR_32(v1_0);
  H_PREDICTOR_32(v2_0);
  H_PREDICTOR_32(v3_0);

  H_PREDICTOR_32(v4_0);
  H_PREDICTOR_32(v5_0);
  H_PREDICTOR_32(v6_0);
  H_PREDICTOR_32(v7_0);

  H_PREDICTOR_32(v8_0);
  H_PREDICTOR_32(v9_0);
  H_PREDICTOR_32(v10_0);
  H_PREDICTOR_32(v11_0);

  H_PREDICTOR_32(v12_0);
  H_PREDICTOR_32(v13_0);
  H_PREDICTOR_32(v14_0);
  H_PREDICTOR_32(v15_0);

  H_PREDICTOR_32(v0_1);
  H_PREDICTOR_32(v1_1);
  H_PREDICTOR_32(v2_1);
  H_PREDICTOR_32(v3_1);

  H_PREDICTOR_32(v4_1);
  H_PREDICTOR_32(v5_1);
  H_PREDICTOR_32(v6_1);
  H_PREDICTOR_32(v7_1);

  H_PREDICTOR_32(v8_1);
  H_PREDICTOR_32(v9_1);
  H_PREDICTOR_32(v10_1);
  H_PREDICTOR_32(v11_1);

  H_PREDICTOR_32(v12_1);
  H_PREDICTOR_32(v13_1);
  H_PREDICTOR_32(v14_1);
  H_PREDICTOR_32(v15_1);
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_tm_predictor_4x4_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;
  uint8x16_t d;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
  dst += stride;

  d = vec_vsx_ld(0, dst);
  tmp = unpack_to_s16_l(d);
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_sel(vec_packsu(val, tmp), d, (uint8x16_t)mask4), 0, dst);
}

void vpx_tm_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const int16x8_t l = unpack_to_s16_h(vec_vsx_ld(0, left));
  const int16x8_t a = unpack_to_s16_h(vec_vsx_ld(0, above));
  int16x8_t tmp, val;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 0), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 1), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 2), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 3), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 4), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 5), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 6), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
  dst += stride;

  tmp = unpack_to_s16_l(vec_vsx_ld(0, dst));
  val = vec_sub(vec_add(vec_splat(l, 7), a), tl);
  vec_vsx_st(vec_packsu(val, tmp), 0, dst);
}
#endif

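// TM (true motion) prediction: pred(r, c) = clip(left[r] + above[c] -
// above[-1]). The neighbors are widened to 16 bits, the sum is formed one row
// at a time, and vec_packsu saturates the result back to 8-bit pixels.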
static void tm_predictor_16x8(uint8_t *dst, const ptrdiff_t stride, int16x8_t l,
                              int16x8_t ah, int16x8_t al, int16x8_t tl) {
  int16x8_t vh, vl, ls;

  ls = vec_splat(l, 0);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 1);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 2);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 3);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 4);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 5);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 6);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  dst += stride;

  ls = vec_splat(l, 7);
  vh = vec_sub(vec_add(ls, ah), tl);
  vl = vec_sub(vec_add(ls, al), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
}

void vpx_tm_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l = vec_vsx_ld(0, left);
  const int16x8_t lh = unpack_to_s16_h(l);
  const int16x8_t ll = unpack_to_s16_l(l);
  const uint8x16_t a = vec_vsx_ld(0, above);
  const int16x8_t ah = unpack_to_s16_h(a);
  const int16x8_t al = unpack_to_s16_l(a);

  tm_predictor_16x8(dst, stride, lh, ah, al, tl);

  dst += stride * 8;

  tm_predictor_16x8(dst, stride, ll, ah, al, tl);
}

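// Emit one 32-pixel TM row: the splatted left pixel ls is added to the four
// 16-bit halves of the above row (a0h/a0l/a1h/a1l) and the top-left pixel tl
// is subtracted before saturating back to bytes.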
static INLINE void tm_predictor_32x1(uint8_t *dst, const int16x8_t ls,
                                     const int16x8_t a0h, const int16x8_t a0l,
                                     const int16x8_t a1h, const int16x8_t a1l,
                                     const int16x8_t tl) {
  int16x8_t vh, vl;

  vh = vec_sub(vec_add(ls, a0h), tl);
  vl = vec_sub(vec_add(ls, a0l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 0, dst);
  vh = vec_sub(vec_add(ls, a1h), tl);
  vl = vec_sub(vec_add(ls, a1l), tl);
  vec_vsx_st(vec_packsu(vh, vl), 16, dst);
}

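// Eight TM rows of width 32, one row per left pixel held in l.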
static void tm_predictor_32x8(uint8_t *dst, const ptrdiff_t stride,
                              const int16x8_t l, const uint8x16_t a0,
                              const uint8x16_t a1, const int16x8_t tl) {
  const int16x8_t a0h = unpack_to_s16_h(a0);
  const int16x8_t a0l = unpack_to_s16_l(a0);
  const int16x8_t a1h = unpack_to_s16_h(a1);
  const int16x8_t a1l = unpack_to_s16_l(a1);

  tm_predictor_32x1(dst, vec_splat(l, 0), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 1), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 2), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 3), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 4), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 5), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 6), a0h, a0l, a1h, a1l, tl);
  dst += stride;

  tm_predictor_32x1(dst, vec_splat(l, 7), a0h, a0l, a1h, a1l, tl);
}

void vpx_tm_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const int16x8_t tl = unpack_to_s16_h(vec_splat(vec_vsx_ld(-1, above), 0));
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l0), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_h(l1), a0, a1, tl);
  dst += stride * 8;

  tm_predictor_32x8(dst, stride, unpack_to_s16_l(l1), a0, a1, tl);
}

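// DC fill helpers: write the splatted DC value to every row of the block. The
// 8x8 variant merges the value into only the block's 8 bytes of each 16-byte
// store via xxpermdi, leaving the remaining destination bytes unchanged.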
static INLINE void dc_fill_predictor_8x8(uint8_t *dst, const ptrdiff_t stride,
                                         const uint8x16_t val) {
  int i;

  for (i = 0; i < 8; i++, dst += stride) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(val, d, 1), 0, dst);
  }
}

static INLINE void dc_fill_predictor_16x16(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 16; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
  }
}

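// DC_128: no neighbors are available, so fill the block with 128 (1 << 7).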
void vpx_dc_128_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_16x16(dst, stride, v128);
}

static INLINE void dc_fill_predictor_32x32(uint8_t *dst, const ptrdiff_t stride,
                                           const uint8x16_t val) {
  int i;

  for (i = 0; i < 32; i++, dst += stride) {
    vec_vsx_st(val, 0, dst);
    vec_vsx_st(val, 16, dst);
  }
}

void vpx_dc_128_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint8x16_t v128 = vec_sl(vec_splat_u8(1), vec_splat_u8(7));
  (void)above;
  (void)left;

  dc_fill_predictor_32x32(dst, stride, v128);
}

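// Rounded average of 16 neighbor pixels, (sum + 8) >> 4, splatted to every
// byte lane.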
static uint8x16_t avg16(const uint8_t *values) {
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(vec_vsx_ld(0, values), vec_splat_u32(0));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_16x16(dst, stride, avg16(left));
}

void vpx_dc_top_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_16x16(dst, stride, avg16(above));
}

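// Rounded average of 32 neighbor pixels, (sum + 16) >> 5, splatted to every
// byte lane.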
static uint8x16_t avg32(const uint8_t *values) {
  const uint8x16_t v0 = vec_vsx_ld(0, values);
  const uint8x16_t v1 = vec_vsx_ld(16, values);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(v0, vec_sum4s(v1, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_left_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;

  dc_fill_predictor_32x32(dst, stride, avg32(left));
}

void vpx_dc_top_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)left;

  dc_fill_predictor_32x32(dst, stride, avg32(above));
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
static uint8x16_t dc_avg8(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const int32x4_t sum4s8 = xxpermdi(sum4s, vec_splat_s32(0), 1);
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s8, vec_splat_s32(8));
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(4));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}
#endif

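// DC value for a 16x16 block: rounded average of the 16 above and 16 left
// pixels, (sum + 16) >> 5.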
static uint8x16_t dc_avg16(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const int32x4_t v16 = vec_sl(vec_splat_s32(1), vec_splat_u32(4));
  const int32x4_t sum4s =
      (int32x4_t)vec_sum4s(l0, vec_sum4s(a0, vec_splat_u32(0)));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v16);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(5));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_dc_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_8x8(dst, stride, dc_avg8(above, left));
}
#endif

void vpx_dc_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_16x16(dst, stride, dc_avg16(above, left));
}

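// DC value for a 32x32 block: rounded average of the 32 above and 32 left
// pixels, (sum + 32) >> 6.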
static uint8x16_t dc_avg32(const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t l0 = vec_vsx_ld(0, left);
  const uint8x16_t l1 = vec_vsx_ld(16, left);
  const int32x4_t v32 = vec_sl(vec_splat_s32(1), vec_splat_u32(5));
  const uint32x4_t a_sum = vec_sum4s(a0, vec_sum4s(a1, vec_splat_u32(0)));
  const int32x4_t sum4s = (int32x4_t)vec_sum4s(l0, vec_sum4s(l1, a_sum));
  const uint32x4_t sum = (uint32x4_t)vec_sums(sum4s, v32);
  const uint32x4_t avg = (uint32x4_t)vec_sr(sum, vec_splat_u32(6));

  return vec_splat(vec_pack(vec_pack(avg, vec_splat_u32(0)), vec_splat_u16(0)),
                   3);
}

void vpx_dc_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  dc_fill_predictor_32x32(dst, stride, dc_avg32(above, left));
}

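// Rounded 3-tap average (a + 2 * b + c + 2) >> 2 computed per byte without
// intermediate overflow: (a & c) + ((a ^ c) >> 1) is (a + c) >> 1, and vec_avg
// then adds b with rounding.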
static uint8x16_t avg3(const uint8x16_t a, const uint8x16_t b,
                       const uint8x16_t c) {
  const uint8x16_t ac =
      vec_adds(vec_and(a, c), vec_sr(vec_xor(a, c), vec_splat_u8(1)));

  return vec_avg(ac, b);
}

// Workaround vec_sld/vec_xxsldi/vec_lsdoi being missing or broken.
static const uint8x16_t sl1 = { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
                                0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x10 };

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_d45_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 7);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    const uint8x16_t d = vec_vsx_ld(0, dst);
    vec_vsx_st(xxpermdi(row, d, 1), 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}
#endif

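// D45 prediction: the first row is the 3-tap filtered above row,
// avg3(above[x], above[x + 1], above[x + 2]); each following row is the
// previous one shifted left by one pixel (vec_perm with sl1) and padded with
// the replicated above-right pixel.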
void vpx_d45_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(a, 15);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row, 0, dst);
    dst += stride;
    row = vec_perm(row, above_right, sl1);
  }
}

void vpx_d45_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 15);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0 = avg3(a0, b0, c0);
  uint8x16_t row1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 16, dst);
    dst += stride;
    row0 = vec_perm(row0, row1, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

// TODO(crbug.com/webm/1522): Fix test failures.
#if 0
void vpx_d63_predictor_8x8_vsx(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t af = vec_vsx_ld(0, above);
  const uint8x16_t above_right = vec_splat(af, 9);
  const uint8x16_t a = xxpermdi(af, above_right, 1);
  const uint8x16_t b = vec_perm(a, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a, b);
  uint8x16_t row1 = avg3(a, b, c);
  int i;
  (void)left;

  for (i = 0; i < 4; i++) {
    const uint8x16_t d0 = vec_vsx_ld(0, dst);
    const uint8x16_t d1 = vec_vsx_ld(0, dst + stride);
    vec_vsx_st(xxpermdi(row0, d0, 1), 0, dst);
    vec_vsx_st(xxpermdi(row1, d1, 1), 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}
#endif

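// D63 prediction: even rows use the 2-tap average of adjacent above pixels,
// odd rows the 3-tap average; each successive pair of rows is shifted one
// pixel further to the left.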
void vpx_d63_predictor_16x16_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t above_right = vec_splat(a1, 0);
  const uint8x16_t b = vec_perm(a0, above_right, sl1);
  const uint8x16_t c = vec_perm(b, above_right, sl1);
  uint8x16_t row0 = vec_avg(a0, b);
  uint8x16_t row1 = avg3(a0, b, c);
  int i;
  (void)left;

  for (i = 0; i < 8; i++) {
    vec_vsx_st(row0, 0, dst);
    vec_vsx_st(row1, 0, dst + stride);
    dst += stride * 2;
    row0 = vec_perm(row0, above_right, sl1);
    row1 = vec_perm(row1, above_right, sl1);
  }
}

void vpx_d63_predictor_32x32_vsx(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vec_vsx_ld(0, above);
  const uint8x16_t a1 = vec_vsx_ld(16, above);
  const uint8x16_t a2 = vec_vsx_ld(32, above);
  const uint8x16_t above_right = vec_splat(a2, 0);
  const uint8x16_t b0 = vec_perm(a0, a1, sl1);
  const uint8x16_t b1 = vec_perm(a1, above_right, sl1);
  const uint8x16_t c0 = vec_perm(b0, b1, sl1);
  const uint8x16_t c1 = vec_perm(b1, above_right, sl1);
  uint8x16_t row0_0 = vec_avg(a0, b0);
  uint8x16_t row0_1 = vec_avg(a1, b1);
  uint8x16_t row1_0 = avg3(a0, b0, c0);
  uint8x16_t row1_1 = avg3(a1, b1, c1);
  int i;
  (void)left;

  for (i = 0; i < 16; i++) {
    vec_vsx_st(row0_0, 0, dst);
    vec_vsx_st(row0_1, 16, dst);
    vec_vsx_st(row1_0, 0, dst + stride);
    vec_vsx_st(row1_1, 16, dst + stride);
    dst += stride * 2;
    row0_0 = vec_perm(row0_0, row0_1, sl1);
    row0_1 = vec_perm(row0_1, above_right, sl1);
    row1_0 = vec_perm(row1_0, row1_1, sl1);
    row1_1 = vec_perm(row1_1, above_right, sl1);
  }
}