1 /*
2  * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 #ifndef AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
12 #define AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
13 
#include <assert.h>
#include <emmintrin.h>  // SSE2
#include <tmmintrin.h>  // SSSE3

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_dsp/x86/txfm_common_sse2.h"
23 
24 #ifdef __cplusplus
25 extern "C" {
26 #endif
27 
// Scales every 16-bit lane of `in` by two weights in one go:
//   out0 = mulhrs(in, w0 * 8),  out1 = mulhrs(in, w1 * 8)
// _mm_mulhrs_epi16 computes a rounded (a * b) >> 15 per lane, so with the
// weights pre-multiplied by 8 each lane ends up as a rounded (in * w) >> 12
// (as long as w * 8 still fits in a signed 16-bit value).
//
// `in` is evaluated exactly once. w0/w1 are parenthesized before scaling so
// that expression arguments (e.g. `a + b` or `-c`) are scaled as a whole.
// out0/out1 must be assignable __m128i lvalues.
#define btf_16_ssse3(w0, w1, in, out0, out1)      \
  do {                                            \
    const __m128i _w0 = _mm_set1_epi16((w0) * 8); \
    const __m128i _w1 = _mm_set1_epi16((w1) * 8); \
    const __m128i _in = (in);                     \
    (out0) = _mm_mulhrs_epi16(_in, _w0);          \
    (out1) = _mm_mulhrs_epi16(_in, _w1);          \
  } while (0)
36 
// In-place butterfly on two vectors of 16-bit lanes. Both inputs are latched
// first; then in0 is overwritten with the saturating sum (in0 + in1) and in1
// with the saturating difference (in0 - in1) of the latched values.
// in0 and in1 must be assignable __m128i lvalues.
#define btf_16_adds_subs_sse2(in0, in1)   \
  do {                                    \
    const __m128i _btf_a = (in0);         \
    const __m128i _btf_b = (in1);         \
    in0 = _mm_adds_epi16(_btf_a, _btf_b); \
    in1 = _mm_subs_epi16(_btf_a, _btf_b); \
  } while (0)
44 
// In-place butterfly on two vectors of 16-bit lanes, storing the difference
// before the sum. Both inputs are latched first; then in1 is overwritten with
// the saturating difference (in0 - in1) and in0 with the saturating sum
// (in0 + in1) of the latched values.
// in0 and in1 must be assignable __m128i lvalues.
#define btf_16_subs_adds_sse2(in0, in1)   \
  do {                                    \
    const __m128i _btf_a = (in0);         \
    const __m128i _btf_b = (in1);         \
    in1 = _mm_subs_epi16(_btf_a, _btf_b); \
    in0 = _mm_adds_epi16(_btf_a, _btf_b); \
  } while (0)
52 
// Out-of-place butterfly on two vectors of 16-bit lanes:
//   out0 = saturating (in0 + in1),  out1 = saturating (in0 - in1).
// The inputs are latched before either output is written, so an output may
// name the same object as an input. in0 and in1 are each evaluated once.
#define btf_16_adds_subs_out_sse2(out0, out1, in0, in1) \
  do {                                                  \
    const __m128i _btf_a = (in0);                       \
    const __m128i _btf_b = (in1);                       \
    out0 = _mm_adds_epi16(_btf_a, _btf_b);              \
    out1 = _mm_subs_epi16(_btf_a, _btf_b);              \
  } while (0)
60 
round_shift_16bit_ssse3(__m128i * in,int size,int bit)61 static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
62   if (bit < 0) {
63     const __m128i scale = _mm_set1_epi16(1 << (15 + bit));
64     for (int i = 0; i < size; ++i) {
65       in[i] = _mm_mulhrs_epi16(in[i], scale);
66     }
67   } else if (bit > 0) {
68     for (int i = 0; i < size; ++i) {
69       in[i] = _mm_slli_epi16(in[i], bit);
70     }
71   }
72 }
73 
// 1D inverse-transform kinds. IFLIPADST_1D is deliberately an alias of
// IADST_1D (same numeric value), so there are ITX_TYPES_1D == 3 distinct
// kinds.
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D = 0,
  IADST_1D = 1,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D = 2,
  ITX_TYPES_1D = 3,  // number of distinct kinds
} ITX_TYPE_1D;
82 
83 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
84   IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
85   IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
86   IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
87   IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
88 };
89 
90 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
91   IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
92   IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
93   IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
94   IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
95 };
96 
97 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x8_default[8]) = {
98   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0707,
99 };
100 
101 DECLARE_ALIGNED(16, static const int16_t,
102                 av1_eob_to_eobxy_16x16_default[16]) = {
103   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
104   0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
105 };
106 
107 DECLARE_ALIGNED(16, static const int16_t,
108                 av1_eob_to_eobxy_32x32_default[32]) = {
109   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
110   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
111   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
112   0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f, 0x1f1f,
113 };
114 
115 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x16_default[16]) = {
116   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
117   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07,
118 };
119 
120 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_16x8_default[8]) = {
121   0x0707, 0x0707, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f, 0x070f,
122 };
123 
124 DECLARE_ALIGNED(16, static const int16_t,
125                 av1_eob_to_eobxy_16x32_default[32]) = {
126   0x0707, 0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f0f,
127   0x0f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
128   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
129   0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f, 0x1f0f,
130 };
131 
132 DECLARE_ALIGNED(16, static const int16_t,
133                 av1_eob_to_eobxy_32x16_default[16]) = {
134   0x0707, 0x0f0f, 0x0f0f, 0x0f0f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
135   0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f, 0x0f1f,
136 };
137 
138 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_8x32_default[32]) = {
139   0x0707, 0x0707, 0x0707, 0x0707, 0x0707, 0x0f07, 0x0f07, 0x0f07,
140   0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x0f07, 0x1f07, 0x1f07, 0x1f07,
141   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
142   0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07, 0x1f07,
143 };
144 
145 DECLARE_ALIGNED(16, static const int16_t, av1_eob_to_eobxy_32x8_default[8]) = {
146   0x0707, 0x070f, 0x070f, 0x071f, 0x071f, 0x071f, 0x071f, 0x071f,
147 };
148 
149 DECLARE_ALIGNED(16, static const int16_t *,
150                 av1_eob_to_eobxy_default[TX_SIZES_ALL]) = {
151   NULL,
152   av1_eob_to_eobxy_8x8_default,
153   av1_eob_to_eobxy_16x16_default,
154   av1_eob_to_eobxy_32x32_default,
155   av1_eob_to_eobxy_32x32_default,
156   NULL,
157   NULL,
158   av1_eob_to_eobxy_8x16_default,
159   av1_eob_to_eobxy_16x8_default,
160   av1_eob_to_eobxy_16x32_default,
161   av1_eob_to_eobxy_32x16_default,
162   av1_eob_to_eobxy_32x32_default,
163   av1_eob_to_eobxy_32x32_default,
164   NULL,
165   NULL,
166   av1_eob_to_eobxy_8x32_default,
167   av1_eob_to_eobxy_32x8_default,
168   av1_eob_to_eobxy_16x32_default,
169   av1_eob_to_eobxy_32x16_default,
170 };
171 
// Maps an index in 0..31 to a bucket number: 0 -> 0, 1..7 -> 1, 8..15 -> 2,
// 16..31 -> 3. The bucket boundaries coincide with those of eob_fill.
// Not referenced inside this header; consumers live in the including files.
static const int lowbd_txfm_all_1d_zeros_idx[32] = {
  0, 1, 1, 1, 1, 1, 1, 1,  // [0..7]
  2, 2, 2, 2, 2, 2, 2, 2,  // [8..15]
  3, 3, 3, 3, 3, 3, 3, 3,  // [16..23]
  3, 3, 3, 3, 3, 3, 3, 3,  // [24..31]
};
176 
177 // Transform block width in log2 for eob (size of 64 map to 32)
178 static const int tx_size_wide_log2_eob[TX_SIZES_ALL] = {
179   2, 3, 4, 5, 5, 2, 3, 3, 4, 4, 5, 5, 5, 2, 4, 3, 5, 4, 5,
180 };
181 
get_eobx_eoby_scan_default(int * eobx,int * eoby,TX_SIZE tx_size,int eob)182 static INLINE void get_eobx_eoby_scan_default(int *eobx, int *eoby,
183                                               TX_SIZE tx_size, int eob) {
184   if (eob == 1) {
185     *eobx = 0;
186     *eoby = 0;
187     return;
188   }
189 
190   const int tx_w_log2 = tx_size_wide_log2_eob[tx_size];
191   const int eob_row = (eob - 1) >> tx_w_log2;
192   const int eobxy = av1_eob_to_eobxy_default[tx_size][eob_row];
193   *eobx = eobxy & 0xFF;
194   *eoby = eobxy >> 8;
195 }
196 
// Rounds a zero-based position in 0..31 up to the last index of the enclosing
// 8/16/32 group: 0 -> 0, 1..7 -> 7, 8..15 -> 15, 16..31 -> 31.
// Read-only lookup table, hence const (it is only ever read in this header).
static const int eob_fill[32] = {
  0,  7,  7,  7,  7,  7,  7,  7,  15, 15, 15, 15, 15, 15, 15, 15,
  31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31, 31,
};
201 
get_eobx_eoby_scan_h_identity(int * eobx,int * eoby,TX_SIZE tx_size,int eob)202 static INLINE void get_eobx_eoby_scan_h_identity(int *eobx, int *eoby,
203                                                  TX_SIZE tx_size, int eob) {
204   eob -= 1;
205   const int txfm_size_col = tx_size_wide[tx_size];
206   const int eobx_max = AOMMIN(32, txfm_size_col) - 1;
207   *eobx = (eob >= eobx_max) ? eobx_max : eob_fill[eob];
208   const int temp_eoby = eob / (eobx_max + 1);
209   assert(temp_eoby < 32);
210   *eoby = eob_fill[temp_eoby];
211 }
212 
get_eobx_eoby_scan_v_identity(int * eobx,int * eoby,TX_SIZE tx_size,int eob)213 static INLINE void get_eobx_eoby_scan_v_identity(int *eobx, int *eoby,
214                                                  TX_SIZE tx_size, int eob) {
215   eob -= 1;
216   const int txfm_size_row = tx_size_high[tx_size];
217   const int eoby_max = AOMMIN(32, txfm_size_row) - 1;
218   *eobx = eob / (eoby_max + 1);
219   *eoby = (eob >= eoby_max) ? eoby_max : eob_fill[eob];
220 }
221 
// Pointer to a 1D transform kernel operating on __m128i vectors: it reads
// from `input` and writes to `output`.
// NOTE(review): cos_bit presumably selects the precision of the cosine
// constants used by the kernel - confirm against the implementations.
typedef void (*transform_1d_ssse3)(const __m128i *input,
                                   __m128i *output,
                                   int8_t cos_bit);
224 
225 void av1_lowbd_inv_txfm2d_add_ssse3(const int32_t *input, uint8_t *output,
226                                     int stride, TX_TYPE tx_type,
227                                     TX_SIZE tx_size, int eob);
228 #ifdef __cplusplus
229 }  // extern "C"
230 #endif
231 
232 #endif  // AOM_AV1_COMMON_X86_AV1_INV_TXFM_SSSE3_H_
233