1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
13 #define AOM_AOM_DSP_X86_BLEND_SSE4_H_
14 
15 #include "aom_dsp/blend.h"
16 #include "aom_dsp/x86/synonyms.h"
// Byte-index table of 32 entries. Each 16-entry half lists the even indices
// 0..14 first and the odd indices 1..15 second; both halves are identical.
// NOTE(review): presumably used as a byte-shuffle control that separates the
// even and odd bytes of a vector -- confirm against the users of this table.
static const uint8_t g_blend_a64_mask_shuffle[32] = {
  0, 2, 4, 6, 8, 10, 12, 14,  // first half: even indices
  1, 3, 5, 7, 9, 11, 13, 15,  // first half: odd indices
  0, 2, 4, 6, 8, 10, 12, 14,  // second half: even indices
  1, 3, 5, 7, 9, 11, 13, 15,  // second half: odd indices
};
21 
22 //////////////////////////////////////////////////////////////////////////////
23 // Common kernels
24 //////////////////////////////////////////////////////////////////////////////
25 
blend_4(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)26 static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
27                               const __m128i *v_m0_w, const __m128i *v_m1_w) {
28   const __m128i v_s0_b = xx_loadl_32(src0);
29   const __m128i v_s1_b = xx_loadl_32(src1);
30   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
31   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
32 
33   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
34   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
35   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
36   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
37 
38   return v_res_w;
39 }
40 
blend_8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_w,const __m128i * v_m1_w)41 static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
42                               const __m128i *v_m0_w, const __m128i *v_m1_w) {
43   const __m128i v_s0_b = xx_loadl_64(src0);
44   const __m128i v_s1_b = xx_loadl_64(src1);
45   const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
46   const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
47 
48   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
49   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
50 
51   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
52 
53   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
54 
55   return v_res_w;
56 }
57 
blend_4_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)58 static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
59                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
60                                  const __m128i *rounding) {
61   const __m128i v_s0_b = xx_loadl_32(src0);
62   const __m128i v_s1_b = xx_loadl_32(src1);
63 
64   const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
65                                            _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
66 
67   const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
68   const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
69   return v_res;
70 }
71 
blend_8_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)72 static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
73                                  const __m128i *v_m0_b, const __m128i *v_m1_b,
74                                  const __m128i *rounding) {
75   const __m128i v_s0_b = xx_loadl_64(src0);
76   const __m128i v_s1_b = xx_loadl_64(src1);
77 
78   const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
79                                            _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
80 
81   const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
82   const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
83   return v_res;
84 }
85 
blend_16_u8(const uint8_t * src0,const uint8_t * src1,const __m128i * v_m0_b,const __m128i * v_m1_b,const __m128i * rounding)86 static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
87                                   const __m128i *v_m0_b, const __m128i *v_m1_b,
88                                   const __m128i *rounding) {
89   const __m128i v_s0_b = xx_loadu_128(src0);
90   const __m128i v_s1_b = xx_loadu_128(src1);
91 
92   const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
93                                            _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
94   const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
95                                            _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
96 
97   const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
98   const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
99   const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
100   return v_res;
101 }
102 
// Function-pointer type matching the signature shared by the 16-bit-sample
// kernels in this header (blend_4_b10, blend_8_b10, blend_4_b12 and
// blend_8_b12): two source pointers plus the two mask-weight vectors passed
// by value, returning the blended 16-bit lanes.
typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
                                 const __m128i v_m0_w, const __m128i v_m1_w);
105 
blend_4_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)106 static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
107                                   const __m128i v_m0_w, const __m128i v_m1_w) {
108   const __m128i v_s0_w = xx_loadl_64(src0);
109   const __m128i v_s1_w = xx_loadl_64(src1);
110 
111   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
112   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
113 
114   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
115 
116   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
117 
118   return v_res_w;
119 }
120 
blend_8_b10(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)121 static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
122                                   const __m128i v_m0_w, const __m128i v_m1_w) {
123   const __m128i v_s0_w = xx_loadu_128(src0);
124   const __m128i v_s1_w = xx_loadu_128(src1);
125 
126   const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
127   const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
128 
129   const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
130 
131   const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
132 
133   return v_res_w;
134 }
135 
blend_4_b12(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)136 static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
137                                   const __m128i v_m0_w, const __m128i v_m1_w) {
138   const __m128i v_s0_w = xx_loadl_64(src0);
139   const __m128i v_s1_w = xx_loadl_64(src1);
140 
141   // Interleave
142   const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
143   const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
144 
145   // Multiply-Add
146   const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
147 
148   // Scale
149   const __m128i v_ssum_d =
150       _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
151 
152   // Pack
153   const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
154 
155   // Round
156   const __m128i v_res_w = xx_round_epu16(v_pssum_d);
157 
158   return v_res_w;
159 }
160 
blend_8_b12(const uint16_t * src0,const uint16_t * src1,const __m128i v_m0_w,const __m128i v_m1_w)161 static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
162                                   const __m128i v_m0_w, const __m128i v_m1_w) {
163   const __m128i v_s0_w = xx_loadu_128(src0);
164   const __m128i v_s1_w = xx_loadu_128(src1);
165 
166   // Interleave
167   const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
168   const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
169   const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
170   const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
171 
172   // Multiply-Add
173   const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
174   const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
175 
176   // Scale
177   const __m128i v_ssuml_d =
178       _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
179   const __m128i v_ssumh_d =
180       _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
181 
182   // Pack
183   const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
184 
185   // Round
186   const __m128i v_res_w = xx_round_epu16(v_pssum_d);
187 
188   return v_res_w;
189 }
190 
191 #endif  // AOM_AOM_DSP_X86_BLEND_SSE4_H_
192