/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
#ifndef AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_
#define AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_

#include <emmintrin.h>  // SSE2

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
#include "av1/common/av1_txfm.h"

#ifdef __cplusplus
extern "C" {
#endif

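// Butterfly on 4 16-bit values: interleaves *in0/*in1 and multiplies by the
// cosine weight pairs *w0 and *w1 (via _mm_madd_epi16), then rounds and
// shifts right by cos_bit. Only the low four 16-bit lanes of the outputs are
// meaningful.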
static INLINE void btf_16_w4_sse2(
    const __m128i *const w0, const __m128i *const w1, const __m128i __rounding,
    const int8_t cos_bit, const __m128i *const in0, const __m128i *const in1,
    __m128i *const out0, __m128i *const out1) {
  const __m128i t0 = _mm_unpacklo_epi16(*in0, *in1);
  const __m128i u0 = _mm_madd_epi16(t0, *w0);
  const __m128i v0 = _mm_madd_epi16(t0, *w1);
  const __m128i a0 = _mm_add_epi32(u0, __rounding);
  const __m128i b0 = _mm_add_epi32(v0, __rounding);
  const __m128i c0 = _mm_srai_epi32(a0, cos_bit);
  const __m128i d0 = _mm_srai_epi32(b0, cos_bit);

  *out0 = _mm_packs_epi32(c0, c0);
  *out1 = _mm_packs_epi32(d0, d0);
}

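// Butterfly macro for 4-pixel columns. Unlike btf_16_w4_sse2() above, the
// macros below take the weights by value and capture __rounding and cos_bit
// from the enclosing scope.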
#define btf_16_4p_sse2(w0, w1, in0, in1, out0, out1) \
  {                                                  \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);       \
    __m128i u0 = _mm_madd_epi16(t0, w0);             \
    __m128i v0 = _mm_madd_epi16(t0, w1);             \
                                                     \
    __m128i a0 = _mm_add_epi32(u0, __rounding);      \
    __m128i b0 = _mm_add_epi32(v0, __rounding);      \
                                                     \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);        \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);        \
                                                     \
    out0 = _mm_packs_epi32(c0, c0);                  \
    out1 = _mm_packs_epi32(d0, d0);                  \
  }

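// Full-width butterfly on 8 16-bit values: processes both the low and high
// halves of in0/in1. A typical call site (illustrative only; the real uses
// live in the SSE2 transform kernels) builds the weights with
// pair_set_epi16():
//   const __m128i cospi_p32_p32 = pair_set_epi16(cospi[32], cospi[32]);
//   const __m128i cospi_p32_m32 = pair_set_epi16(cospi[32], -cospi[32]);
//   btf_16_sse2(cospi_p32_p32, cospi_p32_m32, x[0], x[1], x[0], x[1]);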
#define btf_16_sse2(w0, w1, in0, in1, out0, out1) \
  {                                               \
    __m128i t0 = _mm_unpacklo_epi16(in0, in1);    \
    __m128i t1 = _mm_unpackhi_epi16(in0, in1);    \
    __m128i u0 = _mm_madd_epi16(t0, w0);          \
    __m128i u1 = _mm_madd_epi16(t1, w0);          \
    __m128i v0 = _mm_madd_epi16(t0, w1);          \
    __m128i v1 = _mm_madd_epi16(t1, w1);          \
                                                  \
    __m128i a0 = _mm_add_epi32(u0, __rounding);   \
    __m128i a1 = _mm_add_epi32(u1, __rounding);   \
    __m128i b0 = _mm_add_epi32(v0, __rounding);   \
    __m128i b1 = _mm_add_epi32(v1, __rounding);   \
                                                  \
    __m128i c0 = _mm_srai_epi32(a0, cos_bit);     \
    __m128i c1 = _mm_srai_epi32(a1, cos_bit);     \
    __m128i d0 = _mm_srai_epi32(b0, cos_bit);     \
    __m128i d1 = _mm_srai_epi32(b1, cos_bit);     \
                                                  \
    out0 = _mm_packs_epi32(c0, c1);               \
    out1 = _mm_packs_epi32(d0, d1);               \
  }

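// Loads 8 16-bit values from a 16-byte-aligned pointer.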
static INLINE __m128i load_16bit_to_16bit(const int16_t *a) {
  return _mm_load_si128((const __m128i *)a);
}

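// Loads 8 32-bit values and packs them to 16 bits with signed saturation.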
static INLINE __m128i load_32bit_to_16bit(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  const __m128i a_high = _mm_load_si128((const __m128i *)(a + 4));
  return _mm_packs_epi32(a_low, a_high);
}

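// Loads 4 32-bit values and packs them to 16 bits; the upper four lanes of
// the result duplicate the lower four.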
static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
  const __m128i a_low = _mm_load_si128((const __m128i *)a);
  return _mm_packs_epi32(a_low, a_low);
}

// Store 4 16-bit values, sign-extended to 32 bits.
static INLINE void store_16bit_to_32bit_w4(const __m128i a, int32_t *const b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  _mm_store_si128((__m128i *)b, a_1);
}

// Store 8 16-bit values, sign-extended to 32 bits.
static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
  const __m128i a_lo = _mm_unpacklo_epi16(a, a);
  const __m128i a_hi = _mm_unpackhi_epi16(a, a);
  const __m128i a_1 = _mm_srai_epi32(a_lo, 16);
  const __m128i a_2 = _mm_srai_epi32(a_hi, 16);
  _mm_store_si128((__m128i *)b, a_1);
  _mm_store_si128((__m128i *)(b + 4), a_2);
}

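// Multiplies each 16-bit value by scale with rounding, shifting right by
// NewSqrt2Bits. pair_set_epi16() interleaves scale with the rounding
// constant, so a single _mm_madd_epi16 computes v * scale + rounding per
// lane, assuming the input holds values interleaved with 1s (see the rect
// stores below).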
static INLINE __m128i scale_round_sse2(const __m128i a, const int scale) {
  const __m128i scale_rounding = pair_set_epi16(scale, 1 << (NewSqrt2Bits - 1));
  const __m128i b = _mm_madd_epi16(a, scale_rounding);
  return _mm_srai_epi32(b, NewSqrt2Bits);
}

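// Stores 4 values for a rectangular transform, scaling each by
// NewSqrt2 / 2^NewSqrt2Bits (~sqrt(2)) first. Interleaving with 1 supplies
// the rounding term consumed by scale_round_sse2().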
static INLINE void store_rect_16bit_to_32bit_w4(const __m128i a,
                                                int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
}

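// 8-value counterpart of store_rect_16bit_to_32bit_w4().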
static INLINE void store_rect_16bit_to_32bit(const __m128i a,
                                             int32_t *const b) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a_lo = _mm_unpacklo_epi16(a, one);
  const __m128i a_hi = _mm_unpackhi_epi16(a, one);
  const __m128i b_lo = scale_round_sse2(a_lo, NewSqrt2);
  const __m128i b_hi = scale_round_sse2(a_hi, NewSqrt2);
  _mm_store_si128((__m128i *)b, b_lo);
  _mm_store_si128((__m128i *)(b + 4), b_hi);
}

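// Buffer loaders: read out_size rows spaced stride elements apart into out.
// The _w4 variants handle 4-wide blocks; the _flip variants store the rows
// in reverse order, serving the flipped (FLIPADST) transform types.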
static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *const in,
                                                 const int stride,
                                                 __m128i *const out,
                                                 const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_16bit_to_16bit_w4_flip(const int16_t *const in,
                                                      const int stride,
                                                      __m128i *const out,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = _mm_loadl_epi64((const __m128i *)(in + i * stride));
  }
}

static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_16bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
                                              __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
                                                 __m128i *out, int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[i] = load_32bit_to_16bit_w4(in + i * stride);
  }
}

static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                   int stride, __m128i *out,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    out[out_size - i - 1] = load_32bit_to_16bit(in + i * stride);
  }
}

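// Buffer stores: write out_size vectors to rows of out spaced stride
// elements apart, widening to 32 bits; the _rect variants also apply the
// rectangular sqrt(2) scaling. store_buffer_16bit_to_16bit_8x8() instead
// writes eight rows of 16-bit values directly.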
static INLINE void store_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                  int32_t *const out,
                                                  const int stride,
                                                  const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w4(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit_w4(in[i], out + i * stride);
  }
}

static INLINE void store_rect_buffer_16bit_to_32bit_w8(const __m128i *const in,
                                                       int32_t *const out,
                                                       const int stride,
                                                       const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    store_rect_16bit_to_32bit(in[i], out + i * stride);
  }
}

static INLINE void store_buffer_16bit_to_16bit_8x8(const __m128i *in,
                                                   uint16_t *out,
                                                   const int stride) {
  for (int i = 0; i < 8; ++i) {
    _mm_store_si128((__m128i *)(out + i * stride), in[i]);
  }
}

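// Rounds and right-shifts each of the size vectors in place when bit < 0, or
// left-shifts them when bit > 0. The saturating add keeps the rounding
// offset from wrapping around.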
static INLINE void round_shift_16bit(__m128i *in, int size, int bit) {
  if (bit < 0) {
    bit = -bit;
    __m128i rounding = _mm_set1_epi16(1 << (bit - 1));
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_adds_epi16(in[i], rounding);
      in[i] = _mm_srai_epi16(in[i], bit);
    }
  } else if (bit > 0) {
    for (int i = 0; i < size; ++i) {
      in[i] = _mm_slli_epi16(in[i], bit);
    }
  }
}

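// Reverses the order of the size vectors while copying from in to out.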
static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
  for (int i = 0; i < size; ++i) {
    out[size - i - 1] = in[i];
  }
}

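// Low-bitdepth forward 2D transforms: input is stride-spaced 16-bit residue,
// output receives 32-bit coefficients. The bd argument is part of the shared
// transform prototype.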
void av1_lowbd_fwd_txfm2d_4x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_4x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x4_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output,
                                   int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_8x32_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x4_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x8_sse2(const int16_t *input, int32_t *output,
                                    int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_32x32_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_16x64_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

void av1_lowbd_fwd_txfm2d_64x16_sse2(const int16_t *input, int32_t *output,
                                     int stride, TX_TYPE tx_type, int bd);

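// A single 1D (column or row) transform kernel; cos_bit controls the
// rounding shift applied in the butterfly stages. A 2D transform typically
// runs col on the input, transposes, then runs row (illustrative sequence;
// the drivers live in the SSE2 transform sources).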
typedef void (*transform_1d_sse2)(const __m128i *input, __m128i *output,
                                  int8_t cos_bit);

typedef struct {
  transform_1d_sse2 col, row;  // vertical and horizontal
} transform_2d_sse2;

#ifdef __cplusplus
}
#endif  // __cplusplus
#endif  // AOM_AV1_COMMON_X86_AV1_TXFM_SSE2_H_