1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <smmintrin.h> // SSE4.1
13
14 #include <assert.h>
15
16 #include "aom/aom_integer.h"
17 #include "aom_ports/mem.h"
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/blend.h"
20
21 #include "aom_dsp/x86/synonyms.h"
22 #include "aom_dsp/x86/blend_sse4.h"
23
24 #include "config/aom_dsp_rtcd.h"
25
26 //////////////////////////////////////////////////////////////////////////////
27 // Implementation - No sub-sampling
28 //////////////////////////////////////////////////////////////////////////////
29
blend_a64_vmask_w4_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)30 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
31 const uint8_t *src0, uint32_t src0_stride,
32 const uint8_t *src1, uint32_t src1_stride,
33 const uint8_t *mask, int w, int h) {
34 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
35
36 (void)w;
37
38 do {
39 const __m128i v_m0_w = _mm_set1_epi16(*mask);
40 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
41
42 const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
43
44 const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
45
46 xx_storel_32(dst, v_res_b);
47
48 dst += dst_stride;
49 src0 += src0_stride;
50 src1 += src1_stride;
51 mask += 1;
52 } while (--h);
53 }
54
blend_a64_vmask_w8_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)55 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
56 const uint8_t *src0, uint32_t src0_stride,
57 const uint8_t *src1, uint32_t src1_stride,
58 const uint8_t *mask, int w, int h) {
59 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
60
61 (void)w;
62
63 do {
64 const __m128i v_m0_w = _mm_set1_epi16(*mask);
65 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
66
67 const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
68
69 const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
70
71 xx_storel_64(dst, v_res_b);
72
73 dst += dst_stride;
74 src0 += src0_stride;
75 src1 += src1_stride;
76 mask += 1;
77 } while (--h);
78 }
79
blend_a64_vmask_w16n_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)80 static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
81 const uint8_t *src0,
82 uint32_t src0_stride,
83 const uint8_t *src1,
84 uint32_t src1_stride,
85 const uint8_t *mask, int w, int h) {
86 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
87
88 do {
89 int c;
90 const __m128i v_m0_w = _mm_set1_epi16(*mask);
91 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
92 for (c = 0; c < w; c += 16) {
93 const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
94 const __m128i v_resh_w =
95 blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
96
97 const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
98
99 xx_storeu_128(dst + c, v_res_b);
100 }
101 dst += dst_stride;
102 src0 += src0_stride;
103 src1 += src1_stride;
104 mask += 1;
105 } while (--h);
106 }
107
108 //////////////////////////////////////////////////////////////////////////////
109 // Dispatch
110 //////////////////////////////////////////////////////////////////////////////
111
aom_blend_a64_vmask_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)112 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
113 const uint8_t *src0, uint32_t src0_stride,
114 const uint8_t *src1, uint32_t src1_stride,
115 const uint8_t *mask, int w, int h) {
116 typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
117 const uint8_t *src0, uint32_t src0_stride,
118 const uint8_t *src1, uint32_t src1_stride,
119 const uint8_t *mask, int w, int h);
120
121 // Dimension: width_index
122 static const blend_fn blend[9] = {
123 blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
124 aom_blend_a64_vmask_c, // w == 1
125 aom_blend_a64_vmask_c, // w == 2
126 NULL, // INVALID
127 blend_a64_vmask_w4_sse4_1, // w == 4
128 NULL, // INVALID
129 NULL, // INVALID
130 NULL, // INVALID
131 blend_a64_vmask_w8_sse4_1, // w == 8
132 };
133
134 assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
135 assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
136
137 assert(h >= 1);
138 assert(w >= 1);
139 assert(IS_POWER_OF_TWO(h));
140 assert(IS_POWER_OF_TWO(w));
141
142 blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
143 h);
144 }
145
146 #if CONFIG_AV1_HIGHBITDEPTH
147 //////////////////////////////////////////////////////////////////////////////
148 // Implementation - No sub-sampling
149 //////////////////////////////////////////////////////////////////////////////
150
blend_a64_vmask_bn_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int h,blend_unit_fn blend)151 static INLINE void blend_a64_vmask_bn_w4_sse4_1(
152 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
153 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
154 const uint8_t *mask, int h, blend_unit_fn blend) {
155 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
156
157 do {
158 const __m128i v_m0_w = _mm_set1_epi16(*mask);
159 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
160
161 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
162
163 xx_storel_64(dst, v_res_w);
164
165 dst += dst_stride;
166 src0 += src0_stride;
167 src1 += src1_stride;
168 mask += 1;
169 } while (--h);
170 }
171
blend_a64_vmask_b10_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)172 static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
173 const uint16_t *src0,
174 uint32_t src0_stride,
175 const uint16_t *src1,
176 uint32_t src1_stride,
177 const uint8_t *mask, int w, int h) {
178 (void)w;
179 blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
180 src1_stride, mask, h, blend_4_b10);
181 }
182
blend_a64_vmask_b12_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)183 static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
184 const uint16_t *src0,
185 uint32_t src0_stride,
186 const uint16_t *src1,
187 uint32_t src1_stride,
188 const uint8_t *mask, int w, int h) {
189 (void)w;
190 blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
191 src1_stride, mask, h, blend_4_b12);
192 }
193
blend_a64_vmask_bn_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h,blend_unit_fn blend)194 static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
195 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
196 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
197 const uint8_t *mask, int w, int h, blend_unit_fn blend) {
198 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
199
200 do {
201 int c;
202 const __m128i v_m0_w = _mm_set1_epi16(*mask);
203 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
204 for (c = 0; c < w; c += 8) {
205 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
206
207 xx_storeu_128(dst + c, v_res_w);
208 }
209 dst += dst_stride;
210 src0 += src0_stride;
211 src1 += src1_stride;
212 mask += 1;
213 } while (--h);
214 }
215
blend_a64_vmask_b10_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)216 static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
217 const uint16_t *src0,
218 uint32_t src0_stride,
219 const uint16_t *src1,
220 uint32_t src1_stride,
221 const uint8_t *mask, int w, int h) {
222 blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
223 src1_stride, mask, w, h, blend_8_b10);
224 }
225
blend_a64_vmask_b12_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)226 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
227 const uint16_t *src0,
228 uint32_t src0_stride,
229 const uint16_t *src1,
230 uint32_t src1_stride,
231 const uint8_t *mask, int w, int h) {
232 blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
233 src1_stride, mask, w, h, blend_8_b12);
234 }
235
236 //////////////////////////////////////////////////////////////////////////////
237 // Dispatch
238 //////////////////////////////////////////////////////////////////////////////
239
aom_highbd_blend_a64_vmask_sse4_1(uint8_t * dst_8,uint32_t dst_stride,const uint8_t * src0_8,uint32_t src0_stride,const uint8_t * src1_8,uint32_t src1_stride,const uint8_t * mask,int w,int h,int bd)240 void aom_highbd_blend_a64_vmask_sse4_1(
241 uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
242 uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
243 const uint8_t *mask, int w, int h, int bd) {
244 typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
245 const uint16_t *src0, uint32_t src0_stride,
246 const uint16_t *src1, uint32_t src1_stride,
247 const uint8_t *mask, int w, int h);
248
249 // Dimensions are: bd_index X width_index
250 static const blend_fn blend[2][2] = {
251 {
252 // bd == 8 or 10
253 blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
254 blend_a64_vmask_b10_w4_sse4_1, // w == 4
255 },
256 {
257 // bd == 12
258 blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
259 blend_a64_vmask_b12_w4_sse4_1, // w == 4
260 }
261 };
262
263 assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
264 assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
265
266 assert(h >= 1);
267 assert(w >= 1);
268 assert(IS_POWER_OF_TWO(h));
269 assert(IS_POWER_OF_TWO(w));
270
271 assert(bd == 8 || bd == 10 || bd == 12);
272
273 if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
274 aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
275 src1_stride, mask, w, h, bd);
276 } else {
277 uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
278 const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
279 const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
280
281 blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
282 src1_stride, mask, w, h);
283 }
284 }
285 #endif // CONFIG_AV1_HIGHBITDEPTH
286