1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <smmintrin.h>  // SSE4.1
13 
14 #include <assert.h>
15 
16 #include "aom/aom_integer.h"
17 #include "aom_ports/mem.h"
18 #include "aom_dsp/aom_dsp_common.h"
19 #include "aom_dsp/blend.h"
20 
21 #include "aom_dsp/x86/synonyms.h"
22 #include "aom_dsp/x86/blend_sse4.h"
23 
24 #include "config/aom_dsp_rtcd.h"
25 
26 //////////////////////////////////////////////////////////////////////////////
27 // Implementation - No sub-sampling
28 //////////////////////////////////////////////////////////////////////////////
29 
blend_a64_vmask_w4_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)30 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
31                                       const uint8_t *src0, uint32_t src0_stride,
32                                       const uint8_t *src1, uint32_t src1_stride,
33                                       const uint8_t *mask, int w, int h) {
34   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
35 
36   (void)w;
37 
38   do {
39     const __m128i v_m0_w = _mm_set1_epi16(*mask);
40     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
41 
42     const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
43 
44     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
45 
46     xx_storel_32(dst, v_res_b);
47 
48     dst += dst_stride;
49     src0 += src0_stride;
50     src1 += src1_stride;
51     mask += 1;
52   } while (--h);
53 }
54 
blend_a64_vmask_w8_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)55 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
56                                       const uint8_t *src0, uint32_t src0_stride,
57                                       const uint8_t *src1, uint32_t src1_stride,
58                                       const uint8_t *mask, int w, int h) {
59   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
60 
61   (void)w;
62 
63   do {
64     const __m128i v_m0_w = _mm_set1_epi16(*mask);
65     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
66 
67     const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
68 
69     const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
70 
71     xx_storel_64(dst, v_res_b);
72 
73     dst += dst_stride;
74     src0 += src0_stride;
75     src1 += src1_stride;
76     mask += 1;
77   } while (--h);
78 }
79 
blend_a64_vmask_w16n_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)80 static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
81                                         const uint8_t *src0,
82                                         uint32_t src0_stride,
83                                         const uint8_t *src1,
84                                         uint32_t src1_stride,
85                                         const uint8_t *mask, int w, int h) {
86   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
87 
88   do {
89     int c;
90     const __m128i v_m0_w = _mm_set1_epi16(*mask);
91     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
92     for (c = 0; c < w; c += 16) {
93       const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
94       const __m128i v_resh_w =
95           blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
96 
97       const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
98 
99       xx_storeu_128(dst + c, v_res_b);
100     }
101     dst += dst_stride;
102     src0 += src0_stride;
103     src1 += src1_stride;
104     mask += 1;
105   } while (--h);
106 }
107 
108 //////////////////////////////////////////////////////////////////////////////
109 // Dispatch
110 //////////////////////////////////////////////////////////////////////////////
111 
aom_blend_a64_vmask_sse4_1(uint8_t * dst,uint32_t dst_stride,const uint8_t * src0,uint32_t src0_stride,const uint8_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)112 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
113                                 const uint8_t *src0, uint32_t src0_stride,
114                                 const uint8_t *src1, uint32_t src1_stride,
115                                 const uint8_t *mask, int w, int h) {
116   typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
117                            const uint8_t *src0, uint32_t src0_stride,
118                            const uint8_t *src1, uint32_t src1_stride,
119                            const uint8_t *mask, int w, int h);
120 
121   // Dimension: width_index
122   static const blend_fn blend[9] = {
123     blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
124     aom_blend_a64_vmask_c,        // w == 1
125     aom_blend_a64_vmask_c,        // w == 2
126     NULL,                         // INVALID
127     blend_a64_vmask_w4_sse4_1,    // w == 4
128     NULL,                         // INVALID
129     NULL,                         // INVALID
130     NULL,                         // INVALID
131     blend_a64_vmask_w8_sse4_1,    // w == 8
132   };
133 
134   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
135   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
136 
137   assert(h >= 1);
138   assert(w >= 1);
139   assert(IS_POWER_OF_TWO(h));
140   assert(IS_POWER_OF_TWO(w));
141 
142   blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
143                  h);
144 }
145 
146 #if CONFIG_AV1_HIGHBITDEPTH
147 //////////////////////////////////////////////////////////////////////////////
148 // Implementation - No sub-sampling
149 //////////////////////////////////////////////////////////////////////////////
150 
blend_a64_vmask_bn_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int h,blend_unit_fn blend)151 static INLINE void blend_a64_vmask_bn_w4_sse4_1(
152     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
153     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
154     const uint8_t *mask, int h, blend_unit_fn blend) {
155   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
156 
157   do {
158     const __m128i v_m0_w = _mm_set1_epi16(*mask);
159     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
160 
161     const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
162 
163     xx_storel_64(dst, v_res_w);
164 
165     dst += dst_stride;
166     src0 += src0_stride;
167     src1 += src1_stride;
168     mask += 1;
169   } while (--h);
170 }
171 
blend_a64_vmask_b10_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)172 static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
173                                           const uint16_t *src0,
174                                           uint32_t src0_stride,
175                                           const uint16_t *src1,
176                                           uint32_t src1_stride,
177                                           const uint8_t *mask, int w, int h) {
178   (void)w;
179   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
180                                src1_stride, mask, h, blend_4_b10);
181 }
182 
blend_a64_vmask_b12_w4_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)183 static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
184                                           const uint16_t *src0,
185                                           uint32_t src0_stride,
186                                           const uint16_t *src1,
187                                           uint32_t src1_stride,
188                                           const uint8_t *mask, int w, int h) {
189   (void)w;
190   blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
191                                src1_stride, mask, h, blend_4_b12);
192 }
193 
blend_a64_vmask_bn_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h,blend_unit_fn blend)194 static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
195     uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
196     uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
197     const uint8_t *mask, int w, int h, blend_unit_fn blend) {
198   const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
199 
200   do {
201     int c;
202     const __m128i v_m0_w = _mm_set1_epi16(*mask);
203     const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
204     for (c = 0; c < w; c += 8) {
205       const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
206 
207       xx_storeu_128(dst + c, v_res_w);
208     }
209     dst += dst_stride;
210     src0 += src0_stride;
211     src1 += src1_stride;
212     mask += 1;
213   } while (--h);
214 }
215 
blend_a64_vmask_b10_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)216 static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
217                                            const uint16_t *src0,
218                                            uint32_t src0_stride,
219                                            const uint16_t *src1,
220                                            uint32_t src1_stride,
221                                            const uint8_t *mask, int w, int h) {
222   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
223                                 src1_stride, mask, w, h, blend_8_b10);
224 }
225 
blend_a64_vmask_b12_w8n_sse4_1(uint16_t * dst,uint32_t dst_stride,const uint16_t * src0,uint32_t src0_stride,const uint16_t * src1,uint32_t src1_stride,const uint8_t * mask,int w,int h)226 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
227                                            const uint16_t *src0,
228                                            uint32_t src0_stride,
229                                            const uint16_t *src1,
230                                            uint32_t src1_stride,
231                                            const uint8_t *mask, int w, int h) {
232   blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
233                                 src1_stride, mask, w, h, blend_8_b12);
234 }
235 
236 //////////////////////////////////////////////////////////////////////////////
237 // Dispatch
238 //////////////////////////////////////////////////////////////////////////////
239 
aom_highbd_blend_a64_vmask_sse4_1(uint8_t * dst_8,uint32_t dst_stride,const uint8_t * src0_8,uint32_t src0_stride,const uint8_t * src1_8,uint32_t src1_stride,const uint8_t * mask,int w,int h,int bd)240 void aom_highbd_blend_a64_vmask_sse4_1(
241     uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
242     uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
243     const uint8_t *mask, int w, int h, int bd) {
244   typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
245                            const uint16_t *src0, uint32_t src0_stride,
246                            const uint16_t *src1, uint32_t src1_stride,
247                            const uint8_t *mask, int w, int h);
248 
249   // Dimensions are: bd_index X width_index
250   static const blend_fn blend[2][2] = {
251     {
252         // bd == 8 or 10
253         blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
254         blend_a64_vmask_b10_w4_sse4_1,   // w == 4
255     },
256     {
257         // bd == 12
258         blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
259         blend_a64_vmask_b12_w4_sse4_1,   // w == 4
260     }
261   };
262 
263   assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
264   assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
265 
266   assert(h >= 1);
267   assert(w >= 1);
268   assert(IS_POWER_OF_TWO(h));
269   assert(IS_POWER_OF_TWO(w));
270 
271   assert(bd == 8 || bd == 10 || bd == 12);
272 
273   if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
274     aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
275                                  src1_stride, mask, w, h, bd);
276   } else {
277     uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
278     const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
279     const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
280 
281     blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
282                                   src1_stride, mask, w, h);
283   }
284 }
285 #endif  // CONFIG_AV1_HIGHBITDEPTH
286