1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 
14 #include "config/aom_dsp_rtcd.h"
15 
16 // -----------------------------------------------------------------------------
17 // H_PRED
18 
// H_PRED 4x4: writes four rows of four uint16_t values; every element of
// row r equals left[r]. `above` and `bd` are unused.
void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // 64-bit load: left[0..3] land in the low four 16-bit lanes.
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  uint16_t *const r0 = dst;
  uint16_t *const r1 = r0 + stride;
  uint16_t *const r2 = r1 + stride;
  uint16_t *const r3 = r2 + stride;
  (void)bd;
  (void)above;
  // Immediates 0x00 / 0x55 / 0xaa / 0xff select lane 0 / 1 / 2 / 3 for all
  // four low lanes, i.e. they broadcast one left value across the row.
  _mm_storel_epi64((__m128i *)r0, _mm_shufflelo_epi16(l, 0x00));
  _mm_storel_epi64((__m128i *)r1, _mm_shufflelo_epi16(l, 0x55));
  _mm_storel_epi64((__m128i *)r2, _mm_shufflelo_epi16(l, 0xaa));
  _mm_storel_epi64((__m128i *)r3, _mm_shufflelo_epi16(l, 0xff));
}
37 
// H_PRED 4x8: two stacked 4x4 predictions. Rows 0-3 are driven by
// left[0..3] and rows 4-7 by left[4..7]. `above` and `bd` are only forwarded.
void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  uint16_t *const lower_dst = dst + 4 * stride;
  const uint16_t *const lower_left = left + 4;
  aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_4x4_sse2(lower_dst, stride, above, lower_left, bd);
}
46 
// H_PRED 8x4: writes four rows of eight uint16_t values; every element of
// row r equals left[r]. `above` and `bd` are unused. Each row of dst is
// written with an aligned 128-bit store, so dst rows must be 16-byte aligned.
void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // Only left[0..3] are consumed, so a 64-bit load is enough. Unlike a full
  // aligned 128-bit load it does not read past left[3] and does not require
  // `left` to be 16-byte aligned.
  const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
  // Broadcast one left value across the low four lanes of each row vector.
  const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
  const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
  const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
  const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
  (void)above;
  (void)bd;
  // Duplicate the low 64 bits into the high 64 bits to fill all eight lanes.
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
  dst += stride;
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
}
65 
// H_PRED 8x8: writes eight rows of eight uint16_t values; every element of
// row r equals left[r]. `above` and `bd` are unused. `left` is read with an
// aligned 128-bit load and each dst row is written with an aligned 128-bit
// store.
void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i lane;
  (void)bd;
  (void)above;

  // Rows 0-3: broadcast within the low 64 bits, then mirror the low half.
  lane = _mm_shufflelo_epi16(l, 0x00);
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflelo_epi16(l, 0x55);
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflelo_epi16(l, 0xaa);
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflelo_epi16(l, 0xff);
  _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(lane, lane));
  dst += stride;

  // Rows 4-7: broadcast within the high 64 bits, then mirror the high half.
  lane = _mm_shufflehi_epi16(l, 0x00);
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflehi_epi16(l, 0x55);
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflehi_epi16(l, 0xaa);
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(lane, lane));
  dst += stride;
  lane = _mm_shufflehi_epi16(l, 0xff);
  _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(lane, lane));
}
96 
// H_PRED 8x16: two stacked 8x8 predictions. Rows 0-7 are driven by
// left[0..7] and rows 8-15 by left[8..15]. `above` and `bd` are only
// forwarded.
void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  uint16_t *const lower_dst = dst + 8 * stride;
  const uint16_t *const lower_left = left + 8;
  aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
  aom_highbd_h_predictor_8x8_sse2(lower_dst, stride, above, lower_left, bd);
}
105 
// Stores one 16-wide row built from the LOW 64 bits of *row (duplicated to
// fill 128 bits, written twice with aligned stores), then advances *dst by
// `stride` elements.
static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
  uint16_t *const out = *dst;
  _mm_store_si128((__m128i *)out, fill);
  _mm_store_si128((__m128i *)(out + 8), fill);
  *dst = out + stride;
}
113 
// Stores one 16-wide row built from the HIGH 64 bits of *row (duplicated to
// fill 128 bits, written twice with aligned stores), then advances *dst by
// `stride` elements.
static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
  uint16_t *const out = *dst;
  _mm_store_si128((__m128i *)out, fill);
  _mm_store_si128((__m128i *)(out + 8), fill);
  *dst = out + stride;
}
121 
// Writes a 16-wide, 8-row H_PRED tile: row r (0..7) is sixteen copies of
// left[r]. `left` is read with one aligned 128-bit load; rows are emitted
// through h_store_16_unpacklo / h_store_16_unpackhi, which advance dst.
static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i lane;

  // left[0..3] sit in the low 64 bits: broadcast inside the low half.
  lane = _mm_shufflelo_epi16(l, 0x00);
  h_store_16_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0x55);
  h_store_16_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0xaa);
  h_store_16_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0xff);
  h_store_16_unpacklo(&dst, stride, &lane);

  // left[4..7] sit in the high 64 bits: broadcast inside the high half.
  lane = _mm_shufflehi_epi16(l, 0x00);
  h_store_16_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0x55);
  h_store_16_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0xaa);
  h_store_16_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0xff);
  h_store_16_unpackhi(&dst, stride, &lane);
}
142 
// H_PRED 16x8: public entry point that delegates to h_predictor_16x8.
// `above` and `bd` are unused.
void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)bd;
  (void)above;
  h_predictor_16x8(dst, stride, left);
}
150 
// H_PRED 16x16: two 16x8 tiles stacked vertically; the tile starting at
// row `row` consumes left[row .. row + 7]. `above` and `bd` are unused.
void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int row;
  (void)bd;
  (void)above;

  for (row = 0; row < 16; row += 8) {
    h_predictor_16x8(dst + row * stride, stride, left + row);
  }
}
163 
// H_PRED 16x32: four 16x8 tiles stacked vertically; the tile starting at
// row `row` consumes left[row .. row + 7]. `above` and `bd` are unused.
void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int row;
  (void)bd;
  (void)above;

  for (row = 0; row < 32; row += 8) {
    h_predictor_16x8(dst + row * stride, stride, left + row);
  }
}
176 
// Stores one 32-wide row built from the LOW 64 bits of *row (duplicated to
// fill 128 bits, written four times with aligned stores), then advances
// *dst by `stride` elements.
static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i fill = _mm_unpacklo_epi64(*row, *row);
  uint16_t *const out = *dst;
  int col;
  for (col = 0; col < 32; col += 8) {
    _mm_store_si128((__m128i *)(out + col), fill);
  }
  *dst = out + stride;
}
186 
// Stores one 32-wide row built from the HIGH 64 bits of *row (duplicated to
// fill 128 bits, written four times with aligned stores), then advances
// *dst by `stride` elements.
static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
                                       const __m128i *row) {
  const __m128i fill = _mm_unpackhi_epi64(*row, *row);
  uint16_t *const out = *dst;
  int col;
  for (col = 0; col < 32; col += 8) {
    _mm_store_si128((__m128i *)(out + col), fill);
  }
  *dst = out + stride;
}
196 
// Writes a 32-wide, 8-row H_PRED tile: row r (0..7) is thirty-two copies of
// left[r]. `left` is read with one aligned 128-bit load; rows are emitted
// through h_store_32_unpacklo / h_store_32_unpackhi, which advance dst.
static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *left) {
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i lane;

  // left[0..3] sit in the low 64 bits: broadcast inside the low half.
  lane = _mm_shufflelo_epi16(l, 0x00);
  h_store_32_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0x55);
  h_store_32_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0xaa);
  h_store_32_unpacklo(&dst, stride, &lane);
  lane = _mm_shufflelo_epi16(l, 0xff);
  h_store_32_unpacklo(&dst, stride, &lane);

  // left[4..7] sit in the high 64 bits: broadcast inside the high half.
  lane = _mm_shufflehi_epi16(l, 0x00);
  h_store_32_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0x55);
  h_store_32_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0xaa);
  h_store_32_unpackhi(&dst, stride, &lane);
  lane = _mm_shufflehi_epi16(l, 0xff);
  h_store_32_unpackhi(&dst, stride, &lane);
}
217 
// H_PRED 32x16: two 32x8 tiles stacked vertically; the tile starting at
// row `row` consumes left[row .. row + 7]. `above` and `bd` are unused.
void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int row;
  (void)bd;
  (void)above;

  for (row = 0; row < 16; row += 8) {
    h_predictor_32x8(dst + row * stride, stride, left + row);
  }
}
230 
// H_PRED 32x32: four 32x8 tiles stacked vertically; the tile starting at
// row `row` consumes left[row .. row + 7]. `above` and `bd` are unused.
void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  int row;
  (void)bd;
  (void)above;

  for (row = 0; row < 32; row += 8) {
    h_predictor_32x8(dst + row * stride, stride, left + row);
  }
}
243 
244 // -----------------------------------------------------------------------------
245 // DC_TOP, DC_LEFT, DC_128
246 
247 // 4x4
248 
// Returns a vector whose lowest 16-bit lane holds
// ref[0] + ref[1] + ref[2] + ref[3] (16-bit wrapping adds). The remaining
// lanes hold intermediate values and are not meaningful.
static INLINE __m128i dc_sum_4(const uint16_t *ref) {
  const __m128i quad = _mm_loadl_epi64((const __m128i *)ref);
  // Move lanes 2 and 3 down to lanes 0 and 1 so pairs can be added.
  const __m128i upper_pair = _mm_shufflelo_epi16(quad, 0xe);
  const __m128i pair_sums = _mm_add_epi16(quad, upper_pair);
  // Bring lane 1 down to lane 0 and fold the two pair sums together.
  const __m128i lane1 = _mm_shufflelo_epi16(pair_sums, 0x1);
  return _mm_add_epi16(pair_sums, lane1);
}
255 
// Fills a 4x4 block with the value in the lowest 16-bit lane of *dc, using
// one 64-bit store per row.
static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
  int rows_left = 4;
  while (rows_left-- > 0) {
    _mm_storel_epi64((__m128i *)dst, fill);
    dst += stride;
  }
}
264 
aom_highbd_dc_left_predictor_4x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)265 void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
266                                            const uint16_t *above,
267                                            const uint16_t *left, int bd) {
268   const __m128i two = _mm_cvtsi32_si128(2);
269   const __m128i sum = dc_sum_4(left);
270   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
271   (void)above;
272   (void)bd;
273   dc_store_4x4(dst, stride, &dc);
274 }
275 
aom_highbd_dc_top_predictor_4x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)276 void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
277                                           const uint16_t *above,
278                                           const uint16_t *left, int bd) {
279   const __m128i two = _mm_cvtsi32_si128(2);
280   const __m128i sum = dc_sum_4(above);
281   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
282   (void)left;
283   (void)bd;
284   dc_store_4x4(dst, stride, &dc);
285 }
286 
// DC_128 4x4: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  (void)left;
  (void)above;
  dc_store_4x4(dst, stride, &fill);
}
296 
297 // -----------------------------------------------------------------------------
298 // 4x8
299 
// Fills a 4-wide, 8-row block with the value in the lowest 16-bit lane of
// *dc, using one 64-bit store per row.
static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
                                const __m128i *dc) {
  const __m128i fill = _mm_shufflelo_epi16(*dc, 0x0);
  int rows_left = 8;
  while (rows_left-- > 0) {
    _mm_storel_epi64((__m128i *)dst, fill);
    dst += stride;
  }
}
308 
309 // Shared with DC 8xh
// Returns a vector whose lowest 16-bit lane holds ref[0] + ... + ref[7]
// (16-bit wrapping adds). `ref` is read with an aligned 128-bit load. The
// remaining lanes hold intermediate values and are not meaningful.
static INLINE __m128i dc_sum_8(const uint16_t *ref) {
  const __m128i octet = _mm_load_si128((const __m128i *)ref);
  // Fold the upper four lanes onto the lower four.
  const __m128i upper_half = _mm_srli_si128(octet, 8);
  const __m128i quad = _mm_add_epi16(octet, upper_half);
  // Fold lanes 2 and 3 onto lanes 0 and 1.
  const __m128i upper_pair = _mm_shufflelo_epi16(quad, 0xe);
  const __m128i pair_sums = _mm_add_epi16(quad, upper_pair);
  // Fold lane 1 onto lane 0.
  const __m128i lane1 = _mm_shufflelo_epi16(pair_sums, 0x1);
  return _mm_add_epi16(pair_sums, lane1);
}
318 
aom_highbd_dc_left_predictor_4x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)319 void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
320                                            const uint16_t *above,
321                                            const uint16_t *left, int bd) {
322   const __m128i sum = dc_sum_8(left);
323   const __m128i four = _mm_cvtsi32_si128(4);
324   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
325   (void)above;
326   (void)bd;
327   dc_store_4x8(dst, stride, &dc);
328 }
329 
aom_highbd_dc_top_predictor_4x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)330 void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
331                                           const uint16_t *above,
332                                           const uint16_t *left, int bd) {
333   const __m128i two = _mm_cvtsi32_si128(2);
334   const __m128i sum = dc_sum_4(above);
335   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
336   (void)left;
337   (void)bd;
338   dc_store_4x8(dst, stride, &dc);
339 }
340 
// DC_128 4x8: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  (void)left;
  (void)above;
  dc_store_4x8(dst, stride, &fill);
}
350 
351 // -----------------------------------------------------------------------------
352 // 8xh
353 
// Fills an 8-wide block of `height` rows with the value in the lowest
// 16-bit lane of *dc, using one aligned 128-bit store per row. Nothing is
// written when height <= 0.
static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
                                const __m128i *dc) {
  const __m128i low4 = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(low4, low4);
  for (; height > 0; --height) {
    _mm_store_si128((__m128i *)dst, fill);
    dst += stride;
  }
}
363 
364 // -----------------------------------------------------------------------------
365 // DC_TOP
366 
dc_top_predictor_8xh(uint16_t * dst,ptrdiff_t stride,int height,const uint16_t * above)367 static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
368                                         int height, const uint16_t *above) {
369   const __m128i four = _mm_cvtsi32_si128(4);
370   const __m128i sum = dc_sum_8(above);
371   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
372   dc_store_8xh(dst, stride, height, &dc);
373 }
374 
// DC_TOP 8x4: delegates to dc_top_predictor_8xh with a height of 4 rows.
// `left` and `bd` are unused.
void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int rows = 4;
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, rows, above);
}
382 
// DC_TOP 8x8: delegates to dc_top_predictor_8xh with a height of 8 rows.
// `left` and `bd` are unused.
void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int rows = 8;
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, rows, above);
}
390 
// DC_TOP 8x16: delegates to dc_top_predictor_8xh with a height of 16 rows.
// `left` and `bd` are unused.
void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const int rows = 16;
  (void)bd;
  (void)left;
  dc_top_predictor_8xh(dst, stride, rows, above);
}
398 
399 // -----------------------------------------------------------------------------
400 // DC_LEFT
401 
aom_highbd_dc_left_predictor_8x4_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)402 void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
403                                            const uint16_t *above,
404                                            const uint16_t *left, int bd) {
405   const __m128i two = _mm_cvtsi32_si128(2);
406   const __m128i sum = dc_sum_4(left);
407   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
408   (void)above;
409   (void)bd;
410   dc_store_8xh(dst, stride, 4, &dc);
411 }
412 
aom_highbd_dc_left_predictor_8x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)413 void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
414                                            const uint16_t *above,
415                                            const uint16_t *left, int bd) {
416   const __m128i four = _mm_cvtsi32_si128(4);
417   const __m128i sum = dc_sum_8(left);
418   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
419   (void)above;
420   (void)bd;
421   dc_store_8xh(dst, stride, 8, &dc);
422 }
423 
424 // Shared with DC 16xh
dc_sum_16(const uint16_t * ref)425 static INLINE __m128i dc_sum_16(const uint16_t *ref) {
426   const __m128i sum_lo = dc_sum_8(ref);
427   const __m128i sum_hi = dc_sum_8(ref + 8);
428   return _mm_add_epi16(sum_lo, sum_hi);
429 }
430 
aom_highbd_dc_left_predictor_8x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)431 void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
432                                             const uint16_t *above,
433                                             const uint16_t *left, int bd) {
434   const __m128i eight = _mm_cvtsi32_si128(8);
435   const __m128i sum = dc_sum_16(left);
436   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
437   (void)above;
438   (void)bd;
439   dc_store_8xh(dst, stride, 16, &dc);
440 }
441 
442 // -----------------------------------------------------------------------------
443 // DC_128
444 
// DC_128 for 8-wide blocks: fills `height` rows with the constant
// 1 << (bd - 1).
static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
                                        int height, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  dc_store_8xh(dst, stride, height, &fill);
}
451 
// DC_128 8x4: delegates to dc_128_predictor_8xh with a height of 4 rows.
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int rows = 4;
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, rows, bd);
}
459 
// DC_128 8x8: delegates to dc_128_predictor_8xh with a height of 8 rows.
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const int rows = 8;
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, rows, bd);
}
467 
// DC_128 8x16: delegates to dc_128_predictor_8xh with a height of 16 rows.
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const int rows = 16;
  (void)left;
  (void)above;
  dc_128_predictor_8xh(dst, stride, rows, bd);
}
475 
476 // -----------------------------------------------------------------------------
477 // 16xh
478 
// Fills a 16-wide block of `height` rows with the value in the lowest
// 16-bit lane of *dc, using two aligned 128-bit stores per row. Nothing is
// written when height <= 0.
static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  const __m128i low4 = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(low4, low4);
  for (; height > 0; --height) {
    _mm_store_si128((__m128i *)dst, fill);
    _mm_store_si128((__m128i *)(dst + 8), fill);
    dst += stride;
  }
}
489 
490 // -----------------------------------------------------------------------------
491 // DC_LEFT
492 
aom_highbd_dc_left_predictor_16x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)493 void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
494                                             const uint16_t *above,
495                                             const uint16_t *left, int bd) {
496   const __m128i four = _mm_cvtsi32_si128(4);
497   const __m128i sum = dc_sum_8(left);
498   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
499   (void)above;
500   (void)bd;
501   dc_store_16xh(dst, stride, 8, &dc);
502 }
503 
aom_highbd_dc_left_predictor_16x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)504 void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
505                                              const uint16_t *above,
506                                              const uint16_t *left, int bd) {
507   const __m128i eight = _mm_cvtsi32_si128(8);
508   const __m128i sum = dc_sum_16(left);
509   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
510   (void)above;
511   (void)bd;
512   dc_store_16xh(dst, stride, 16, &dc);
513 }
514 
515 // Shared with 32xh
dc_sum_32(const uint16_t * ref)516 static INLINE __m128i dc_sum_32(const uint16_t *ref) {
517   const __m128i zero = _mm_setzero_si128();
518   const __m128i sum_a = dc_sum_16(ref);
519   const __m128i sum_b = dc_sum_16(ref + 16);
520   // 12 bit bd will outrange, so expand to 32 bit before adding final total
521   return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
522                        _mm_unpacklo_epi16(sum_b, zero));
523 }
524 
aom_highbd_dc_left_predictor_16x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)525 void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
526                                              const uint16_t *above,
527                                              const uint16_t *left, int bd) {
528   const __m128i sixteen = _mm_cvtsi32_si128(16);
529   const __m128i sum = dc_sum_32(left);
530   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
531   (void)above;
532   (void)bd;
533   dc_store_16xh(dst, stride, 32, &dc);
534 }
535 
536 // -----------------------------------------------------------------------------
537 // DC_TOP
538 
aom_highbd_dc_top_predictor_16x8_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)539 void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
540                                            const uint16_t *above,
541                                            const uint16_t *left, int bd) {
542   const __m128i eight = _mm_cvtsi32_si128(8);
543   const __m128i sum = dc_sum_16(above);
544   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
545   (void)left;
546   (void)bd;
547   dc_store_16xh(dst, stride, 8, &dc);
548 }
549 
aom_highbd_dc_top_predictor_16x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)550 void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
551                                             const uint16_t *above,
552                                             const uint16_t *left, int bd) {
553   const __m128i eight = _mm_cvtsi32_si128(8);
554   const __m128i sum = dc_sum_16(above);
555   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
556   (void)left;
557   (void)bd;
558   dc_store_16xh(dst, stride, 16, &dc);
559 }
560 
aom_highbd_dc_top_predictor_16x32_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)561 void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
562                                             const uint16_t *above,
563                                             const uint16_t *left, int bd) {
564   const __m128i eight = _mm_cvtsi32_si128(8);
565   const __m128i sum = dc_sum_16(above);
566   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
567   (void)left;
568   (void)bd;
569   dc_store_16xh(dst, stride, 32, &dc);
570 }
571 
572 // -----------------------------------------------------------------------------
573 // DC_128
574 
// DC_128 16x8: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  (void)left;
  (void)above;
  dc_store_16xh(dst, stride, 8, &fill);
}
584 
// DC_128 16x16: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  (void)left;
  (void)above;
  dc_store_16xh(dst, stride, 16, &fill);
}
594 
// DC_128 16x32: fills the block with the constant 1 << (bd - 1).
// `above` and `left` are unused.
void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const int mid_value = 1 << (bd - 1);
  const __m128i fill =
      _mm_shufflelo_epi16(_mm_cvtsi32_si128(mid_value), 0x0);
  (void)left;
  (void)above;
  dc_store_16xh(dst, stride, 32, &fill);
}
604 
605 // -----------------------------------------------------------------------------
606 // 32xh
607 
// Fills a 32-wide block of `height` rows with the value in the lowest
// 16-bit lane of *dc, using four aligned 128-bit stores per row. Nothing is
// written when height <= 0.
static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
                                 const __m128i *dc) {
  const __m128i low4 = _mm_shufflelo_epi16(*dc, 0);
  const __m128i fill = _mm_unpacklo_epi64(low4, low4);
  int col;
  for (; height > 0; --height) {
    for (col = 0; col < 32; col += 8) {
      _mm_store_si128((__m128i *)(dst + col), fill);
    }
    dst += stride;
  }
}
620 
aom_highbd_dc_left_predictor_32x16_sse2(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)621 void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
622                                              const uint16_t *above,
623                                              const uint16_t *left, int bd) {
624   const __m128i eight = _mm_cvtsi32_si128(8);
625   const __m128i sum = dc_sum_16(left);
626   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
627   (void)above;
628   (void)bd;
629   dc_store_32xh(dst, stride, 16, &dc);
630 }
631 
// DC_LEFT predictor, 32x32: the fill value is derived from `left` only;
// `above` and `bd` are not used.
void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  // dc_sum_32 presumably leaves the total of 32 left samples in the low
  // 32-bit lane -- confirm against its definition.
  const __m128i left_sum = dc_sum_32(left);
  // Rounded divide by 32, carried out in 32-bit lanes: (sum + 16) >> 5.
  const __m128i rounding = _mm_cvtsi32_si128(16);
  const __m128i biased = _mm_add_epi32(left_sum, rounding);
  const __m128i dc = _mm_srli_epi32(biased, 5);
  (void)above;
  (void)bd;
  dc_store_32xh(dst, stride, 32, &dc);
}
642 
// DC_TOP predictor, 32x16: the fill value is derived from `above` only;
// `left` and `bd` are not used.
void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // dc_sum_32 presumably leaves the total of 32 above samples in the low
  // 32-bit lane -- confirm against its definition.
  const __m128i top_sum = dc_sum_32(above);
  // Rounded divide by 32, carried out in 32-bit lanes: (sum + 16) >> 5.
  const __m128i rounding = _mm_cvtsi32_si128(16);
  const __m128i biased = _mm_add_epi32(top_sum, rounding);
  const __m128i dc = _mm_srli_epi32(biased, 5);
  (void)left;
  (void)bd;
  dc_store_32xh(dst, stride, 16, &dc);
}
653 
// DC_128 predictor, 32x16: fills the block with 1 << (bd - 1); neither
// `above` nor `left` is read.
void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const int half_range = 1 << (bd - 1);
  const __m128i in_lane0 = _mm_cvtsi32_si128(half_range);
  // Copy lane 0 into the four low 16-bit lanes before handing it on.
  const __m128i half_lo4 = _mm_shufflelo_epi16(in_lane0, 0x0);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 16, &half_lo4);
}
663 
// DC_TOP predictor, 32x32: the fill value is derived from `above` only;
// `left` and `bd` are not used.
void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  // dc_sum_32 presumably leaves the total of 32 above samples in the low
  // 32-bit lane -- confirm against its definition.
  const __m128i top_sum = dc_sum_32(above);
  // Rounded divide by 32, carried out in 32-bit lanes: (sum + 16) >> 5.
  const __m128i rounding = _mm_cvtsi32_si128(16);
  const __m128i biased = _mm_add_epi32(top_sum, rounding);
  const __m128i dc = _mm_srli_epi32(biased, 5);
  (void)left;
  (void)bd;
  dc_store_32xh(dst, stride, 32, &dc);
}
674 
// DC_128 predictor, 32x32: fills the block with 1 << (bd - 1); neither
// `above` nor `left` is read.
void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const int half_range = 1 << (bd - 1);
  const __m128i in_lane0 = _mm_cvtsi32_si128(half_range);
  // Copy lane 0 into the four low 16-bit lanes before handing it on.
  const __m128i half_lo4 = _mm_shufflelo_epi16(in_lane0, 0x0);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, &half_lo4);
}
684 
685 // -----------------------------------------------------------------------------
686 // V_PRED
687 
// Vertical predictor, 4x8: copies the four samples at `above` into each of
// the eight rows of `dst`.
//
//   dst    - top-left sample of the output block
//   stride - distance between consecutive rows, in uint16_t elements
//   above  - the four samples to replicate (8 bytes are read)
//   left   - unused
//   bd     - unused
void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  // 64-bit load/store intrinsics: only the low four lanes are moved.
  const __m128i top_row = _mm_loadl_epi64((const __m128i *)above);
  int row;
  (void)left;
  (void)bd;
  // Row addresses are formed by multiplication instead of `stride << 2`:
  // left-shifting a negative ptrdiff_t is undefined, and indexing this way
  // never forms a pointer beyond the last row written.
  for (row = 0; row < 8; ++row) {
    _mm_storel_epi64((__m128i *)(dst + row * stride), top_row);
  }
}
703 
// Vertical predictor, 8x4: every one of the four output rows is a copy of
// the eight samples at `above`. `stride` is the row pitch in uint16_t
// elements; `left` and `bd` are unused. Aligned 128-bit accesses are used,
// so `above` and each row start must be 16-byte aligned.
void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  int row;
  (void)left;
  (void)bd;
  for (row = 0; row < 4; ++row) {
    _mm_store_si128((__m128i *)(dst + row * stride), top_row);
  }
}
715 
// Vertical predictor, 8x16: copies the eight samples at `above` into each of
// the sixteen rows of `dst`.
//
//   dst    - top-left sample of the output block; every row start must be
//            16-byte aligned because aligned stores are used
//   stride - distance between consecutive rows, in uint16_t elements
//   above  - the eight samples to replicate; must be 16-byte aligned
//   left   - unused
//   bd     - unused
void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const __m128i top_row = _mm_load_si128((const __m128i *)above);
  int row;
  (void)left;
  (void)bd;
  // Row addresses are formed by multiplication instead of `stride << 2`:
  // left-shifting a negative ptrdiff_t is undefined, and indexing this way
  // never forms a pointer beyond the last row written.
  for (row = 0; row < 16; ++row) {
    _mm_store_si128((__m128i *)(dst + row * stride), top_row);
  }
}
731 
// Vertical predictor, 16x8: every one of the eight output rows is a copy of
// the sixteen samples at `above`. `stride` is the row pitch in uint16_t
// elements; `left` and `bd` are unused. Aligned 128-bit accesses are used,
// so `above` and each row start must be 16-byte aligned.
void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const __m128i *const top = (const __m128i *)above;
  const __m128i top_lo = _mm_load_si128(top);      // samples 0..7
  const __m128i top_hi = _mm_load_si128(top + 1);  // samples 8..15
  int row;
  (void)left;
  (void)bd;
  for (row = 0; row < 8; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, top_lo);
    _mm_store_si128(out + 1, top_hi);
  }
}
755 
// Vertical predictor, 16x32: every one of the 32 output rows is a copy of
// the sixteen samples at `above`. `stride` is the row pitch in uint16_t
// elements; `left` and `bd` are unused. Aligned 128-bit accesses are used,
// so `above` and each row start must be 16-byte aligned.
void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i *const top = (const __m128i *)above;
  const __m128i top_lo = _mm_load_si128(top);      // samples 0..7
  const __m128i top_hi = _mm_load_si128(top + 1);  // samples 8..15
  int row;
  (void)left;
  (void)bd;
  for (row = 0; row < 32; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, top_lo);
    _mm_store_si128(out + 1, top_hi);
  }
}
779 
// Vertical predictor, 32x16: every one of the 16 output rows is a copy of
// the 32 samples at `above`. `stride` is the row pitch in uint16_t elements;
// `left` and `bd` are unused. Aligned 128-bit accesses are used, so `above`
// and each row start must be 16-byte aligned.
void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i *const top = (const __m128i *)above;
  // Four vectors of eight samples each cover the 32-sample row.
  const __m128i top_a = _mm_load_si128(top);
  const __m128i top_b = _mm_load_si128(top + 1);
  const __m128i top_c = _mm_load_si128(top + 2);
  const __m128i top_d = _mm_load_si128(top + 3);
  int row;
  (void)left;
  (void)bd;
  for (row = 0; row < 16; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;
    _mm_store_si128(out, top_a);
    _mm_store_si128(out + 1, top_b);
    _mm_store_si128(out + 2, top_c);
    _mm_store_si128(out + 3, top_d);
  }
}
813 
814 // -----------------------------------------------------------------------------
815 // DC_PRED
816 
// DC predictor, 4x8: fills eight rows of four samples with one value derived
// from dc_sum_4(above) and dc_sum_8(left). `stride` is the row pitch in
// uint16_t elements; `bd` is unused.
void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const __m128i top_sum = dc_sum_4(above);
  const __m128i side_sum = dc_sum_8(left);
  const __m128i both = _mm_add_epi16(top_sum, side_sum);
  // The total is read from bits 16..31 of the low 32-bit lane, i.e. 16-bit
  // lane 1. NOTE(review): presumably the dc_sum_* helpers leave the full sum
  // in that lane -- confirm against their definitions.
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both) >> 16;
  // Rounded division by 12.
  const uint32_t dc = (total + 6) / 12;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 8; ++row, dst += stride) {
    _mm_storel_epi64((__m128i *)dst, fill);  // 64-bit store: four samples
  }
}
837 
// DC predictor, 8x4: fills four rows of eight samples with one value derived
// from dc_sum_4(left) and dc_sum_8(above). `stride` is the row pitch in
// uint16_t elements; `bd` is unused. Rows are written with aligned stores,
// so each row start must be 16-byte aligned.
void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const __m128i side_sum = dc_sum_4(left);
  const __m128i top_sum = dc_sum_8(above);
  const __m128i both = _mm_add_epi16(top_sum, side_sum);
  // The total is read from bits 16..31 of the low 32-bit lane, i.e. 16-bit
  // lane 1. NOTE(review): presumably the dc_sum_* helpers leave the full sum
  // in that lane -- confirm against their definitions.
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both) >> 16;
  // Rounded division by 12.
  const uint32_t dc = (total + 6) / 12;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 4; ++row) {
    _mm_store_si128((__m128i *)(dst + row * stride), fill);
  }
}
859 
// DC predictor, 8x16: fills sixteen rows of eight samples with one value
// derived from dc_sum_16(left) and dc_sum_8(above). `stride` is the row
// pitch in uint16_t elements; `bd` is unused. Rows are written with aligned
// stores, so each row start must be 16-byte aligned.
void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i side_sum16 = dc_sum_16(left);
  const __m128i top_sum16 = dc_sum_8(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleave with zero so the low 16-bit lane of each sum becomes a 32-bit
  // lane before the two are added.
  const __m128i side_sum32 = _mm_unpacklo_epi16(side_sum16, zero);
  const __m128i top_sum32 = _mm_unpacklo_epi16(top_sum16, zero);
  const __m128i both = _mm_add_epi32(side_sum32, top_sum32);
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both);
  // Rounded division by 24.
  const uint32_t dc = (total + 12) / 24;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 16; ++row, dst += stride) {
    _mm_store_si128((__m128i *)dst, fill);
  }
}
886 
// DC predictor, 16x8: fills eight rows of sixteen samples with one value
// derived from dc_sum_8(left) and dc_sum_16(above). `stride` is the row
// pitch in uint16_t elements; `bd` is unused. Rows are written with aligned
// stores, so each row start must be 16-byte aligned.
void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  const __m128i side_sum16 = dc_sum_8(left);
  const __m128i top_sum16 = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleave with zero so the low 16-bit lane of each sum becomes a 32-bit
  // lane before the two are added.
  const __m128i side_sum32 = _mm_unpacklo_epi16(side_sum16, zero);
  const __m128i top_sum32 = _mm_unpacklo_epi16(top_sum16, zero);
  const __m128i both = _mm_add_epi32(side_sum32, top_sum32);
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both);
  // Rounded division by 24.
  const uint32_t dc = (total + 12) / 24;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 8; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;  // two 8-sample vectors per row
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
  }
}
917 
// DC predictor, 16x32: fills 32 rows of sixteen samples with one value
// derived from dc_sum_32(left) and dc_sum_16(above). `stride` is the row
// pitch in uint16_t elements; `bd` is unused. Rows are written with aligned
// stores, so each row start must be 16-byte aligned.
void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // The dc_sum_32 result is added as-is in 32-bit lanes (presumably it is
  // already a 32-bit total -- confirm against its definition).
  const __m128i side_sum32 = dc_sum_32(left);
  const __m128i top_sum16 = dc_sum_16(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleave with zero so the low 16-bit lane becomes a 32-bit lane.
  const __m128i top_sum32 = _mm_unpacklo_epi16(top_sum16, zero);
  const __m128i both = _mm_add_epi32(side_sum32, top_sum32);
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both);
  // Rounded division by 48.
  const uint32_t dc = (total + 24) / 48;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 32; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;  // two 8-sample vectors per row
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
  }
}
947 
// DC predictor, 32x16: fills sixteen rows of 32 samples with one value
// derived from dc_sum_16(left) and dc_sum_32(above). `stride` is the row
// pitch in uint16_t elements; `bd` is unused. Rows are written with aligned
// stores, so each row start must be 16-byte aligned.
void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i side_sum16 = dc_sum_16(left);
  // The dc_sum_32 result is added as-is in 32-bit lanes (presumably it is
  // already a 32-bit total -- confirm against its definition).
  const __m128i top_sum32 = dc_sum_32(above);
  const __m128i zero = _mm_setzero_si128();
  // Interleave with zero so the low 16-bit lane becomes a 32-bit lane.
  const __m128i side_sum32 = _mm_unpacklo_epi16(side_sum16, zero);
  const __m128i both = _mm_add_epi32(side_sum32, top_sum32);
  const uint32_t total = (uint32_t)_mm_cvtsi128_si32(both);
  // Rounded division by 48.
  const uint32_t dc = (total + 24) / 48;
  const __m128i fill = _mm_set1_epi16((uint16_t)dc);
  int row;
  (void)bd;
  for (row = 0; row < 16; ++row, dst += stride) {
    __m128i *const out = (__m128i *)dst;  // four 8-sample vectors per row
    _mm_store_si128(out, fill);
    _mm_store_si128(out + 1, fill);
    _mm_store_si128(out + 2, fill);
    _mm_store_si128(out + 3, fill);
  }
}
985