1 /*
2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <tmmintrin.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 
17 // -----------------------------------------------------------------------------
18 /*
19 ; ------------------------------------------
20 ; input: x, y, z, result
21 ;
22 ; trick from pascal
23 ; (x+2y+z+2)>>2 can be calculated as:
24 ; result = avg(x,z)
25 ; result -= xor(x,z) & 1
26 ; result = avg(result,y)
27 ; ------------------------------------------
28 */
avg3_epu16(const __m128i * x,const __m128i * y,const __m128i * z)29 static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
30                                  const __m128i *z) {
31   const __m128i one = _mm_set1_epi16(1);
32   const __m128i a = _mm_avg_epu16(*x, *z);
33   const __m128i b =
34       _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
35   return _mm_avg_epu16(b, *y);
36 }
37 
vpx_highbd_d45_predictor_4x4_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)38 void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
39                                         const uint16_t *above,
40                                         const uint16_t *left, int bd) {
41   const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
42   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
43   const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
44   const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
45   (void)left;
46   (void)bd;
47   _mm_storel_epi64((__m128i *)dst, avg3);
48   dst += stride;
49   _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
50   dst += stride;
51   _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
52   dst += stride;
53   _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
54   dst[3] = above[7];  // aka H
55 }
56 
d45_store_8(uint16_t ** dst,const ptrdiff_t stride,__m128i * row,const __m128i * ar)57 static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
58                                __m128i *row, const __m128i *ar) {
59   *row = _mm_alignr_epi8(*ar, *row, 2);
60   _mm_store_si128((__m128i *)*dst, *row);
61   *dst += stride;
62 }
63 
vpx_highbd_d45_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)64 void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
65                                         const uint16_t *above,
66                                         const uint16_t *left, int bd) {
67   const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
68   const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
69   const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
70   const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
71   const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
72   __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
73   (void)left;
74   (void)bd;
75   _mm_store_si128((__m128i *)dst, avg3);
76   dst += stride;
77   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
78   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
79   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
80   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
81   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
82   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
83   d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
84 }
85 
d45_store_16(uint16_t ** dst,const ptrdiff_t stride,__m128i * row_0,__m128i * row_1,const __m128i * ar)86 static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
87                                 __m128i *row_0, __m128i *row_1,
88                                 const __m128i *ar) {
89   *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
90   *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
91   _mm_store_si128((__m128i *)*dst, *row_0);
92   _mm_store_si128((__m128i *)(*dst + 8), *row_1);
93   *dst += stride;
94 }
95 
vpx_highbd_d45_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)96 void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
97                                           const uint16_t *above,
98                                           const uint16_t *left, int bd) {
99   const __m128i A0 = _mm_load_si128((const __m128i *)above);
100   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
101   const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
102   const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
103   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
104   const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
105   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
106   const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
107   __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
108   __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
109   (void)left;
110   (void)bd;
111   _mm_store_si128((__m128i *)dst, avg3_0);
112   _mm_store_si128((__m128i *)(dst + 8), avg3_1);
113   dst += stride;
114   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
115   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
116   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
117   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
118   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
119   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
120   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
121   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
122   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
123   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
124   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
125   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
126   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
127   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
128   d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
129 }
130 
vpx_highbd_d45_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)131 void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
132                                           const uint16_t *above,
133                                           const uint16_t *left, int bd) {
134   const __m128i A0 = _mm_load_si128((const __m128i *)above);
135   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
136   const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
137   const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
138   const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
139   const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
140   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
141   const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
142   const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
143   const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
144   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
145   const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
146   const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
147   const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
148   __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
149   __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
150   __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
151   __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
152   int i;
153   (void)left;
154   (void)bd;
155   _mm_store_si128((__m128i *)dst, avg3_0);
156   _mm_store_si128((__m128i *)(dst + 8), avg3_1);
157   _mm_store_si128((__m128i *)(dst + 16), avg3_2);
158   _mm_store_si128((__m128i *)(dst + 24), avg3_3);
159   dst += stride;
160   for (i = 1; i < 32; ++i) {
161     avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
162     avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
163     avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
164     avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
165     _mm_store_si128((__m128i *)dst, avg3_0);
166     _mm_store_si128((__m128i *)(dst + 8), avg3_1);
167     _mm_store_si128((__m128i *)(dst + 16), avg3_2);
168     _mm_store_si128((__m128i *)(dst + 24), avg3_3);
169     dst += stride;
170   }
171 }
172 
173 DECLARE_ALIGNED(16, static const uint8_t,
174                 rotate_right_epu16[16]) = { 2,  3,  4,  5,  6,  7,  8, 9,
175                                             10, 11, 12, 13, 14, 15, 0, 1 };
176 
rotr_epu16(__m128i * a,const __m128i * rotrw)177 static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
178   *a = _mm_shuffle_epi8(*a, *rotrw);
179   return *a;
180 }
181 
vpx_highbd_d117_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)182 void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
183                                          const uint16_t *above,
184                                          const uint16_t *left, int bd) {
185   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
186   const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
187   const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
188   const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
189   const __m128i IXABCDEF =
190       _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
191   const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
192   const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
193   const __m128i XIJKLMNO =
194       _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
195   const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
196   __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
197   __m128i rowa = avg2;
198   __m128i rowb = avg3;
199   int i;
200   (void)bd;
201   for (i = 0; i < 8; i += 2) {
202     _mm_store_si128((__m128i *)dst, rowa);
203     dst += stride;
204     _mm_store_si128((__m128i *)dst, rowb);
205     dst += stride;
206     rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
207     rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
208   }
209 }
210 
vpx_highbd_d117_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)211 void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
212                                            const uint16_t *above,
213                                            const uint16_t *left, int bd) {
214   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
215   const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
216   const __m128i A0 = _mm_load_si128((const __m128i *)above);
217   const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
218   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
219   const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
220   const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
221   const __m128i L0 = _mm_load_si128((const __m128i *)left);
222   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
223   const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
224   const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
225   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
226   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
227   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
228   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
229   const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
230   const __m128i L1_ = _mm_srli_si128(L1, 2);
231   __m128i rowa_0 = avg2_0;
232   __m128i rowa_1 = avg2_1;
233   __m128i rowb_0 = avg3_0;
234   __m128i rowb_1 = avg3_1;
235   __m128i avg3_left[2];
236   int i, j;
237   (void)bd;
238   avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
239   avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
240   for (i = 0; i < 2; ++i) {
241     __m128i avg_left = avg3_left[i];
242     for (j = 0; j < 8; j += 2) {
243       _mm_store_si128((__m128i *)dst, rowa_0);
244       _mm_store_si128((__m128i *)(dst + 8), rowa_1);
245       dst += stride;
246       _mm_store_si128((__m128i *)dst, rowb_0);
247       _mm_store_si128((__m128i *)(dst + 8), rowb_1);
248       dst += stride;
249       rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
250       rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
251       rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
252       rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
253     }
254   }
255 }
256 
vpx_highbd_d117_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)257 void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
258                                            const uint16_t *above,
259                                            const uint16_t *left, int bd) {
260   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
261   const __m128i A0 = _mm_load_si128((const __m128i *)above);
262   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
263   const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
264   const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
265   const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
266   const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
267   const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
268   const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
269   const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
270   const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
271   const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
272   const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
273   const __m128i L0 = _mm_load_si128((const __m128i *)left);
274   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
275   const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
276   const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
277   const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
278   const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
279   const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
280   const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
281   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
282   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
283   const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
284   const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
285   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
286   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
287   const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
288   const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
289   const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
290   const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
291   const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
292   const __m128i L3_ = _mm_srli_si128(L3, 2);
293   __m128i rowa_0 = avg2_0;
294   __m128i rowa_1 = avg2_1;
295   __m128i rowa_2 = avg2_2;
296   __m128i rowa_3 = avg2_3;
297   __m128i rowb_0 = avg3_0;
298   __m128i rowb_1 = avg3_1;
299   __m128i rowb_2 = avg3_2;
300   __m128i rowb_3 = avg3_3;
301   __m128i avg3_left[4];
302   int i, j;
303   (void)bd;
304   avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
305   avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
306   avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
307   avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
308   for (i = 0; i < 4; ++i) {
309     __m128i avg_left = avg3_left[i];
310     for (j = 0; j < 8; j += 2) {
311       _mm_store_si128((__m128i *)dst, rowa_0);
312       _mm_store_si128((__m128i *)(dst + 8), rowa_1);
313       _mm_store_si128((__m128i *)(dst + 16), rowa_2);
314       _mm_store_si128((__m128i *)(dst + 24), rowa_3);
315       dst += stride;
316       _mm_store_si128((__m128i *)dst, rowb_0);
317       _mm_store_si128((__m128i *)(dst + 8), rowb_1);
318       _mm_store_si128((__m128i *)(dst + 16), rowb_2);
319       _mm_store_si128((__m128i *)(dst + 24), rowb_3);
320       dst += stride;
321       rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
322       rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
323       rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
324       rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
325       rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
326       rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
327       rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
328       rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
329     }
330   }
331 }
332 
vpx_highbd_d135_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)333 void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
334                                          const uint16_t *above,
335                                          const uint16_t *left, int bd) {
336   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
337   const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
338   const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
339   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
340   const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
341   const __m128i XIJKLMNO =
342       _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
343   const __m128i AXIJKLMN =
344       _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
345   const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
346   __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
347   __m128i rowa = avg3;
348   int i;
349   (void)bd;
350   for (i = 0; i < 8; ++i) {
351     rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
352     _mm_store_si128((__m128i *)dst, rowa);
353     dst += stride;
354   }
355 }
356 
vpx_highbd_d135_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)357 void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
358                                            const uint16_t *above,
359                                            const uint16_t *left, int bd) {
360   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
361   const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
362   const __m128i B0 = _mm_load_si128((const __m128i *)above);
363   const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
364   const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
365   const __m128i L0 = _mm_load_si128((const __m128i *)left);
366   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
367   const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
368   const __m128i C1 = _mm_srli_si128(B1, 2);
369   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
370   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
371   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
372   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
373   const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
374   const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
375   __m128i rowa_0 = avg3_0;
376   __m128i rowa_1 = avg3_1;
377   __m128i avg3_left[2];
378   int i, j;
379   (void)bd;
380   avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
381   avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
382   for (i = 0; i < 2; ++i) {
383     __m128i avg_left = avg3_left[i];
384     for (j = 0; j < 8; ++j) {
385       rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
386       rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
387       _mm_store_si128((__m128i *)dst, rowa_0);
388       _mm_store_si128((__m128i *)(dst + 8), rowa_1);
389       dst += stride;
390     }
391   }
392 }
393 
vpx_highbd_d135_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)394 void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
395                                            const uint16_t *above,
396                                            const uint16_t *left, int bd) {
397   const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
398   const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
399   const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
400   const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
401   const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
402   const __m128i B0 = _mm_load_si128((const __m128i *)above);
403   const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
404   const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
405   const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
406   const __m128i L0 = _mm_load_si128((const __m128i *)left);
407   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
408   const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
409   const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
410   const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
411   const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
412   const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
413   const __m128i C3 = _mm_srli_si128(B3, 2);
414   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
415   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
416   const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
417   const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
418   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
419   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
420   const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
421   const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
422   const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
423   const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
424   const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
425   const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
426   __m128i rowa_0 = avg3_0;
427   __m128i rowa_1 = avg3_1;
428   __m128i rowa_2 = avg3_2;
429   __m128i rowa_3 = avg3_3;
430   __m128i avg3_left[4];
431   int i, j;
432   (void)bd;
433   avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
434   avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
435   avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
436   avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
437   for (i = 0; i < 4; ++i) {
438     __m128i avg_left = avg3_left[i];
439     for (j = 0; j < 8; ++j) {
440       rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
441       rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
442       rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
443       rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
444       _mm_store_si128((__m128i *)dst, rowa_0);
445       _mm_store_si128((__m128i *)(dst + 8), rowa_1);
446       _mm_store_si128((__m128i *)(dst + 16), rowa_2);
447       _mm_store_si128((__m128i *)(dst + 24), rowa_3);
448       dst += stride;
449     }
450   }
451 }
452 
vpx_highbd_d153_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)453 void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
454                                          const uint16_t *above,
455                                          const uint16_t *left, int bd) {
456   const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
457   const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
458   const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
459   const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
460   const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
461   const __m128i XIJKLMNO =
462       _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
463   const __m128i AXIJKLMN =
464       _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
465   const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
466   const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
467   const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
468   const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
469   const __m128i row0 =
470       _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
471   const __m128i row1 =
472       _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
473   const __m128i row2 =
474       _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
475   const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
476   const __m128i row4 =
477       _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
478   const __m128i row5 =
479       _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
480   const __m128i row6 =
481       _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
482   const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
483   (void)bd;
484   _mm_store_si128((__m128i *)dst, row0);
485   dst += stride;
486   _mm_store_si128((__m128i *)dst, row1);
487   dst += stride;
488   _mm_store_si128((__m128i *)dst, row2);
489   dst += stride;
490   _mm_store_si128((__m128i *)dst, row3);
491   dst += stride;
492   _mm_store_si128((__m128i *)dst, row4);
493   dst += stride;
494   _mm_store_si128((__m128i *)dst, row5);
495   dst += stride;
496   _mm_store_si128((__m128i *)dst, row6);
497   dst += stride;
498   _mm_store_si128((__m128i *)dst, row7);
499 }
500 
vpx_highbd_d153_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)501 void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
502                                            const uint16_t *above,
503                                            const uint16_t *left, int bd) {
504   const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
505   const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
506   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
507   const __m128i B1 = _mm_srli_si128(A1, 2);
508   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
509   const __m128i C1 = _mm_srli_si128(A1, 4);
510   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
511   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
512   const __m128i L0 = _mm_load_si128((const __m128i *)left);
513   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
514   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
515   const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
516   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
517   const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
518   const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
519   const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
520   const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
521   const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
522   __m128i row_0 = avg3_0;
523   __m128i row_1 = avg3_1;
524   __m128i avg2_avg3_left[2][2];
525   int i, j;
526   (void)bd;
527 
528   avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
529   avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
530   avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
531   avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
532 
533   for (j = 0; j < 2; ++j) {
534     for (i = 0; i < 2; ++i) {
535       const __m128i avg2_avg3 = avg2_avg3_left[j][i];
536       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
537       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
538       _mm_store_si128((__m128i *)dst, row_0);
539       _mm_store_si128((__m128i *)(dst + 8), row_1);
540       dst += stride;
541       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
542       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
543       _mm_store_si128((__m128i *)dst, row_0);
544       _mm_store_si128((__m128i *)(dst + 8), row_1);
545       dst += stride;
546       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
547       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
548       _mm_store_si128((__m128i *)dst, row_0);
549       _mm_store_si128((__m128i *)(dst + 8), row_1);
550       dst += stride;
551       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
552       row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
553       _mm_store_si128((__m128i *)dst, row_0);
554       _mm_store_si128((__m128i *)(dst + 8), row_1);
555       dst += stride;
556     }
557   }
558 }
559 
vpx_highbd_d153_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)560 void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
561                                            const uint16_t *above,
562                                            const uint16_t *left, int bd) {
563   const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
564   const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
565   const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
566   const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
567   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
568   const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
569   const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
570   const __m128i B3 = _mm_srli_si128(A3, 2);
571   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
572   const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
573   const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
574   const __m128i C3 = _mm_srli_si128(A3, 4);
575   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
576   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
577   const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
578   const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
579   const __m128i L0 = _mm_load_si128((const __m128i *)left);
580   const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
581   const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
582   const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
583   const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
584   const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
585   const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
586   const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
587   const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
588   const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
589   const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
590   const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
591   const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
592   const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
593   const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
594   const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
595   const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
596   const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
597   const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
598   const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
599   __m128i row_0 = avg3_0;
600   __m128i row_1 = avg3_1;
601   __m128i row_2 = avg3_2;
602   __m128i row_3 = avg3_3;
603   __m128i avg2_avg3_left[4][2];
604   int i, j;
605   (void)bd;
606 
607   avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
608   avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
609   avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
610   avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
611   avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
612   avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
613   avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
614   avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);
615 
616   for (j = 0; j < 4; ++j) {
617     for (i = 0; i < 2; ++i) {
618       const __m128i avg2_avg3 = avg2_avg3_left[j][i];
619       row_3 = _mm_alignr_epi8(row_3, row_2, 12);
620       row_2 = _mm_alignr_epi8(row_2, row_1, 12);
621       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
622       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
623       _mm_store_si128((__m128i *)dst, row_0);
624       _mm_store_si128((__m128i *)(dst + 8), row_1);
625       _mm_store_si128((__m128i *)(dst + 16), row_2);
626       _mm_store_si128((__m128i *)(dst + 24), row_3);
627       dst += stride;
628       row_3 = _mm_alignr_epi8(row_3, row_2, 12);
629       row_2 = _mm_alignr_epi8(row_2, row_1, 12);
630       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
631       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
632       _mm_store_si128((__m128i *)dst, row_0);
633       _mm_store_si128((__m128i *)(dst + 8), row_1);
634       _mm_store_si128((__m128i *)(dst + 16), row_2);
635       _mm_store_si128((__m128i *)(dst + 24), row_3);
636       dst += stride;
637       row_3 = _mm_alignr_epi8(row_3, row_2, 12);
638       row_2 = _mm_alignr_epi8(row_2, row_1, 12);
639       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
640       row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
641       _mm_store_si128((__m128i *)dst, row_0);
642       _mm_store_si128((__m128i *)(dst + 8), row_1);
643       _mm_store_si128((__m128i *)(dst + 16), row_2);
644       _mm_store_si128((__m128i *)(dst + 24), row_3);
645       dst += stride;
646       row_3 = _mm_alignr_epi8(row_3, row_2, 12);
647       row_2 = _mm_alignr_epi8(row_2, row_1, 12);
648       row_1 = _mm_alignr_epi8(row_1, row_0, 12);
649       row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
650       _mm_store_si128((__m128i *)dst, row_0);
651       _mm_store_si128((__m128i *)(dst + 8), row_1);
652       _mm_store_si128((__m128i *)(dst + 16), row_2);
653       _mm_store_si128((__m128i *)(dst + 24), row_3);
654       dst += stride;
655     }
656   }
657 }
658 
d207_store_4x8(uint16_t ** dst,const ptrdiff_t stride,const __m128i * a,const __m128i * b)659 static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
660                                   const __m128i *a, const __m128i *b) {
661   _mm_store_si128((__m128i *)*dst, *a);
662   *dst += stride;
663   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
664   *dst += stride;
665   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
666   *dst += stride;
667   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
668   *dst += stride;
669 }
670 
vpx_highbd_d207_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)671 void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
672                                          const uint16_t *above,
673                                          const uint16_t *left, int bd) {
674   const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
675   const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
676   const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
677   const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
678   const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
679   const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
680   const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
681   const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
682   const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
683   (void)above;
684   (void)bd;
685   d207_store_4x8(&dst, stride, &out_a, &out_b);
686   d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
687 }
688 
d207_store_4x16(uint16_t ** dst,const ptrdiff_t stride,const __m128i * a,const __m128i * b,const __m128i * c)689 static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
690                                    const __m128i *a, const __m128i *b,
691                                    const __m128i *c) {
692   _mm_store_si128((__m128i *)*dst, *a);
693   _mm_store_si128((__m128i *)(*dst + 8), *b);
694   *dst += stride;
695   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
696   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
697   *dst += stride;
698   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
699   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
700   *dst += stride;
701   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
702   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
703   *dst += stride;
704 }
705 
vpx_highbd_d207_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)706 void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
707                                            const uint16_t *above,
708                                            const uint16_t *left, int bd) {
709   const __m128i A0 = _mm_load_si128((const __m128i *)left);
710   const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
711   const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
712   const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
713   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
714   const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
715   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
716   const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
717   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
718   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
719   const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
720   const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
721   const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
722   const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
723   const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
724   const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
725   (void)above;
726   (void)bd;
727   d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
728   d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
729   d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
730   d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
731 }
732 
d207_store_4x32(uint16_t ** dst,const ptrdiff_t stride,const __m128i * a,const __m128i * b,const __m128i * c,const __m128i * d,const __m128i * e)733 static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
734                                    const __m128i *a, const __m128i *b,
735                                    const __m128i *c, const __m128i *d,
736                                    const __m128i *e) {
737   _mm_store_si128((__m128i *)*dst, *a);
738   _mm_store_si128((__m128i *)(*dst + 8), *b);
739   _mm_store_si128((__m128i *)(*dst + 16), *c);
740   _mm_store_si128((__m128i *)(*dst + 24), *d);
741   *dst += stride;
742   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
743   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
744   _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
745   _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
746   *dst += stride;
747   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
748   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
749   _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
750   _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
751   *dst += stride;
752   _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
753   _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
754   _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
755   _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
756   *dst += stride;
757 }
758 
vpx_highbd_d207_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)759 void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
760                                            const uint16_t *above,
761                                            const uint16_t *left, int bd) {
762   const __m128i A0 = _mm_load_si128((const __m128i *)left);
763   const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
764   const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
765   const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
766   const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
767   const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
768   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
769   const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
770   const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
771   const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
772   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
773   const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
774   const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
775   const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
776   const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
777   const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
778   const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
779   const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
780   const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
781   const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
782   const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
783   const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
784   const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
785   const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
786   const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
787   const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
788   const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
789   const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
790   const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
791   const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
792   (void)above;
793   (void)bd;
794   d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
795   d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
796   d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
797   d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
798   d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
799   d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
800   d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
801   d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
802 }
803 
d63_store_4x8(uint16_t ** dst,const ptrdiff_t stride,__m128i * a,__m128i * b,const __m128i * ar)804 static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
805                                  __m128i *a, __m128i *b, const __m128i *ar) {
806   _mm_store_si128((__m128i *)*dst, *a);
807   *dst += stride;
808   _mm_store_si128((__m128i *)*dst, *b);
809   *dst += stride;
810   *a = _mm_alignr_epi8(*ar, *a, 2);
811   *b = _mm_alignr_epi8(*ar, *b, 2);
812   _mm_store_si128((__m128i *)*dst, *a);
813   *dst += stride;
814   _mm_store_si128((__m128i *)*dst, *b);
815   *dst += stride;
816   *a = _mm_alignr_epi8(*ar, *a, 2);
817   *b = _mm_alignr_epi8(*ar, *b, 2);
818 }
819 
vpx_highbd_d63_predictor_8x8_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)820 void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
821                                         const uint16_t *above,
822                                         const uint16_t *left, int bd) {
823   const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
824   const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
825   const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
826   const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
827   const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
828   __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
829   __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
830   (void)left;
831   (void)bd;
832   d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
833   d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
834 }
835 
vpx_highbd_d63_predictor_16x16_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)836 void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
837                                           const uint16_t *above,
838                                           const uint16_t *left, int bd) {
839   const __m128i A0 = _mm_load_si128((const __m128i *)above);
840   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
841   const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
842   const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
843   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
844   const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
845   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
846   const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
847   __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
848   __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
849   __m128i avg2_0 = _mm_avg_epu16(A0, B0);
850   __m128i avg2_1 = _mm_avg_epu16(A1, B1);
851   int i;
852   (void)left;
853   (void)bd;
854   for (i = 0; i < 14; i += 2) {
855     _mm_store_si128((__m128i *)dst, avg2_0);
856     _mm_store_si128((__m128i *)(dst + 8), avg2_1);
857     dst += stride;
858     _mm_store_si128((__m128i *)dst, avg3_0);
859     _mm_store_si128((__m128i *)(dst + 8), avg3_1);
860     dst += stride;
861     avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
862     avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
863     avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
864     avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
865   }
866   _mm_store_si128((__m128i *)dst, avg2_0);
867   _mm_store_si128((__m128i *)(dst + 8), avg2_1);
868   dst += stride;
869   _mm_store_si128((__m128i *)dst, avg3_0);
870   _mm_store_si128((__m128i *)(dst + 8), avg3_1);
871 }
872 
vpx_highbd_d63_predictor_32x32_ssse3(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)873 void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
874                                           const uint16_t *above,
875                                           const uint16_t *left, int bd) {
876   const __m128i A0 = _mm_load_si128((const __m128i *)above);
877   const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
878   const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
879   const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
880   const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
881   const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
882   const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
883   const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
884   const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
885   const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
886   const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
887   const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
888   const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
889   const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
890   __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
891   __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
892   __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
893   __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
894   __m128i avg2_0 = _mm_avg_epu16(A0, B0);
895   __m128i avg2_1 = _mm_avg_epu16(A1, B1);
896   __m128i avg2_2 = _mm_avg_epu16(A2, B2);
897   __m128i avg2_3 = _mm_avg_epu16(A3, B3);
898   int i;
899   (void)left;
900   (void)bd;
901   for (i = 0; i < 30; i += 2) {
902     _mm_store_si128((__m128i *)dst, avg2_0);
903     _mm_store_si128((__m128i *)(dst + 8), avg2_1);
904     _mm_store_si128((__m128i *)(dst + 16), avg2_2);
905     _mm_store_si128((__m128i *)(dst + 24), avg2_3);
906     dst += stride;
907     _mm_store_si128((__m128i *)dst, avg3_0);
908     _mm_store_si128((__m128i *)(dst + 8), avg3_1);
909     _mm_store_si128((__m128i *)(dst + 16), avg3_2);
910     _mm_store_si128((__m128i *)(dst + 24), avg3_3);
911     dst += stride;
912     avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
913     avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
914     avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
915     avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
916     avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
917     avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
918     avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
919     avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
920   }
921   _mm_store_si128((__m128i *)dst, avg2_0);
922   _mm_store_si128((__m128i *)(dst + 8), avg2_1);
923   _mm_store_si128((__m128i *)(dst + 16), avg2_2);
924   _mm_store_si128((__m128i *)(dst + 24), avg2_3);
925   dst += stride;
926   _mm_store_si128((__m128i *)dst, avg3_0);
927   _mm_store_si128((__m128i *)(dst + 8), avg3_1);
928   _mm_store_si128((__m128i *)(dst + 16), avg3_2);
929   _mm_store_si128((__m128i *)(dst + 24), avg3_3);
930 }
931