1 /*
2 * Copyright(c) 2019 Intel Corporation
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at https://www.aomedia.org/license/software-license. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at https://www.aomedia.org/license/patent-license.
10 */
11 
12 #ifndef EbHighbdIntraPrediction_SSE2_h
13 #define EbHighbdIntraPrediction_SSE2_h
14 
15 #include <emmintrin.h>
16 #include "EbDefinitions.h"
17 #include "common_dsp_rtcd.h"
18 
// Horizontally adds the four 32-bit lanes of src.
// The complete sum is produced in the lowest 32-bit lane of the result; the
// remaining lanes hold only partial sums left over from the reduction.
static INLINE __m128i dc_sum_4x32bit(const __m128i src) {
    // Fold the upper two lanes onto the lower two.
    const __m128i pairs = _mm_add_epi32(src, _mm_srli_si128(src, 8));
    // Fold lane 1 onto lane 0 to finish the reduction.
    return _mm_add_epi32(pairs, _mm_srli_si128(pairs, 4));
}
26 
// Horizontally adds the four lowest 16-bit lanes of src.
// The sum is produced in the lowest 16-bit lane of the result, computed with
// wrapping 16-bit additions; the other lanes hold partial sums only.
static INLINE __m128i dc_sum_4x16bit(const __m128i src) {
    // Fold lanes 2..3 onto lanes 0..1 (shift by two 16-bit lanes).
    const __m128i pairs = _mm_add_epi16(src, _mm_srli_si128(src, 4));
    // Fold lane 1 onto lane 0 (shift by one 16-bit lane).
    return _mm_add_epi16(pairs, _mm_srli_si128(pairs, 2));
}
36 
dc_sum_4x16bit_large(const __m128i src)37 static INLINE __m128i dc_sum_4x16bit_large(const __m128i src) {
38     // Unpack to avoid 12-bit overflow.
39     const __m128i src_32 = _mm_unpacklo_epi16(src, _mm_setzero_si128());
40     return dc_sum_4x32bit(src_32);
41 }
42 
dc_sum_8x16bit(const __m128i src)43 static INLINE __m128i dc_sum_8x16bit(const __m128i src) {
44     const __m128i src_hi = _mm_srli_si128(src, 8);
45     const __m128i sum    = _mm_add_epi16(src, src_hi);
46     return dc_sum_4x16bit(sum);
47 }
48 
dc_sum_8x16bit_large(const __m128i src)49 static INLINE __m128i dc_sum_8x16bit_large(const __m128i src) {
50     const __m128i src_hi = _mm_srli_si128(src, 8);
51     const __m128i sum    = _mm_add_epi16(src, src_hi);
52     return dc_sum_4x16bit_large(sum);
53 }
54 
dc_sum_4(const uint16_t * const src)55 static INLINE __m128i dc_sum_4(const uint16_t *const src) {
56     const __m128i s = _mm_loadl_epi64((const __m128i *)src);
57     return dc_sum_4x16bit(s);
58 }
59 
dc_sum_8(const uint16_t * const src)60 static INLINE __m128i dc_sum_8(const uint16_t *const src) {
61     const __m128i s = _mm_loadu_si128((const __m128i *)src);
62     return dc_sum_8x16bit(s);
63 }
64 
65 #endif // EbHighbdIntraPrediction_SSE2_h
66