1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include "config/av1_rtcd.h"
13 
14 #include "av1/common/enums.h"
15 #include "av1/common/av1_txfm.h"
16 #include "av1/common/x86/av1_txfm_sse2.h"
17 #include "av1/common/x86/highbd_txfm_utility_sse4.h"
18 #include "av1/encoder/av1_fwd_txfm1d_cfg.h"
19 #include "av1/encoder/x86/av1_txfm1d_sse4.h"
20 #include "av1/encoder/x86/av1_fwd_txfm_sse2.h"
21 
// Widens a square block of 16-bit samples to 32 bits while removing the row
// padding: row r of the source starts at input[r * stride], row r of the
// destination starts at output[r * txfm1d_size]. Exactly
// txfm1d_size * txfm1d_size elements are written.
static INLINE void int16_array_with_stride_to_int32_array_without_stride(
    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
  for (int row = 0; row < txfm1d_size; ++row) {
    const int16_t *src_row = input + row * stride;
    int32_t *dst_row = output + row * txfm1d_size;
    for (int col = 0; col < txfm1d_size; ++col) {
      dst_row[col] = (int32_t)src_row[col];
    }
  }
}
31 
// Common signature of the 1-D forward-transform wrappers selected by
// fwd_txfm_type_to_func(): they read vectors from `input` and write the
// transformed vectors to `output`. The wrappers defined in this file use
// `cos_bit` and ignore `stage_range`.
typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
                             const int8_t cos_bit, const int8_t *stage_range);
34 
// TxfmFuncSSE2 wrapper around av1_fdct32_new_sse4_1().
//
// The buffers are treated as 32 rows of 8 vectors each (32 / 4 lanes). For
// every vector column the 32 entries are gathered into a contiguous scratch
// array, transformed, and scattered back to the same column of `output`.
// `stage_range` is unused.
static void fdct32_new_sse4_1(const __m128i *input, __m128i *output,
                              const int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int vecs_per_row = 32 / 4;
  __m128i gathered[32];
  __m128i transformed[32];
  for (int c = 0; c < vecs_per_row; ++c) {
    const __m128i *src = input + c;
    __m128i *dst = output + c;
    for (int r = 0; r < 32; ++r) {
      gathered[r] = src[r * vecs_per_row];
    }
    av1_fdct32_new_sse4_1(gathered, transformed, cos_bit);
    for (int r = 0; r < 32; ++r) {
      dst[r * vecs_per_row] = transformed[r];
    }
  }
}
55 
// TxfmFuncSSE2 wrapper around av1_fdct64_new_sse4_1().
//
// Invokes the kernel once per vector column (64 / 4 = 16 of them), passing
// the column's first vector and the row pitch (16 vectors) as both trailing
// arguments -- presumably the input and output strides; confirm against the
// kernel's declaration. `stage_range` is unused.
static void fdct64_new_sse4_1(const __m128i *input, __m128i *output,
                              const int8_t cos_bit, const int8_t *stage_range) {
  (void)stage_range;
  const int vecs_per_row = 64 / 4;
  for (int c = 0; c != vecs_per_row; ++c) {
    const __m128i *src_col = &input[c];
    __m128i *dst_col = &output[c];
    av1_fdct64_new_sse4_1(src_col, dst_col, cos_bit, vecs_per_row,
                          vecs_per_row);
  }
}
67 
fwd_txfm_type_to_func(TXFM_TYPE txfm_type)68 static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
69   switch (txfm_type) {
70     case TXFM_TYPE_DCT32: return fdct32_new_sse4_1; break;
71     case TXFM_TYPE_DCT64: return fdct64_new_sse4_1; break;
72     default: assert(0);
73   }
74   return NULL;
75 }
76 
fwd_txfm2d_sse4_1(const int16_t * input,int32_t * output,const int stride,const TXFM_2D_FLIP_CFG * cfg,int32_t * txfm_buf)77 static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
78                                      const int stride,
79                                      const TXFM_2D_FLIP_CFG *cfg,
80                                      int32_t *txfm_buf) {
81   // TODO(sarahparker) This does not currently support rectangular transforms
82   // and will break without splitting txfm_size out into row and col size.
83   // Rectangular transforms use c code only, so it should be ok for now.
84   // It will be corrected when there are sse implementations for rectangular
85   // transforms.
86   assert(cfg->tx_size < TX_SIZES);
87   const int txfm_size = tx_size_wide[cfg->tx_size];
88   const int8_t *shift = cfg->shift;
89   const int8_t *stage_range_col = cfg->stage_range_col;
90   const int8_t *stage_range_row = cfg->stage_range_row;
91   const int8_t cos_bit_col = cfg->cos_bit_col;
92   const int8_t cos_bit_row = cfg->cos_bit_row;
93   const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
94   const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
95 
96   __m128i *buf_128 = (__m128i *)txfm_buf;
97   __m128i *out_128 = (__m128i *)output;
98   int num_per_128 = 4;
99   int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
100 
101   int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
102                                                         txfm_size);
103   av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
104   txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
105   av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
106   transpose_32(txfm_size, out_128, buf_128);
107   txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
108   av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
109   transpose_32(txfm_size, buf_128, out_128);
110 }
111 
fwd_txfm2d_64x64_sse4_1(const int16_t * input,int32_t * output,const int stride,const TXFM_2D_FLIP_CFG * cfg,int32_t * txfm_buf)112 static INLINE void fwd_txfm2d_64x64_sse4_1(const int16_t *input,
113                                            int32_t *output, const int stride,
114                                            const TXFM_2D_FLIP_CFG *cfg,
115                                            int32_t *txfm_buf) {
116   assert(cfg->tx_size < TX_SIZES);
117   const int txfm_size = tx_size_wide[cfg->tx_size];
118   const int8_t *shift = cfg->shift;
119   const int8_t *stage_range_col = cfg->stage_range_col;
120   const int8_t cos_bit_col = cfg->cos_bit_col;
121   const int8_t cos_bit_row = cfg->cos_bit_row;
122   const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
123   __m128i *buf_128 = (__m128i *)txfm_buf;
124   __m128i *out_128 = (__m128i *)output;
125 
126   const int num_per_128 = 4;
127   int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
128   int col_num = txfm_size / num_per_128;
129 
130   int16_array_with_stride_to_int32_array_without_stride(input, stride, output,
131                                                         txfm_size);
132   /*col wise transform*/
133   txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
134   av1_round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
135   transpose_32(txfm_size, out_128, buf_128);
136 
137   /*row wise transform*/
138   for (int col = 0; col < (col_num >> 1); col++) {
139     av1_fdct64_new_sse4_1((buf_128 + col), (out_128 + col), cos_bit_row,
140                           col_num, (col_num >> 1));
141   }
142 
143   txfm2d_size_128 = (col_num >> 1) * (txfm_size >> 1);
144   av1_round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
145   transpose_32x32(buf_128, out_128);
146 }
147 
// Public 32x32 forward 2-D transform entry point. Builds the transform
// configuration for `tx_type`, allocates a 16-byte aligned 32x32 scratch
// buffer on the stack and runs fwd_txfm2d_sse4_1(). `bd` is unused.
void av1_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  TXFM_2D_FLIP_CFG txfm_cfg;
  DECLARE_ALIGNED(16, int32_t, scratch[32 * 32]);
  av1_get_fwd_txfm_cfg(tx_type, TX_32X32, &txfm_cfg);
  fwd_txfm2d_sse4_1(input, output, stride, &txfm_cfg, scratch);
}
156 
// Public 64x64 forward 2-D transform entry point. Builds the transform
// configuration for `tx_type`, allocates a 16-byte aligned 64x64 scratch
// buffer on the stack and runs fwd_txfm2d_64x64_sse4_1(). `bd` is unused.
void av1_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
                                 int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  TXFM_2D_FLIP_CFG txfm_cfg;
  DECLARE_ALIGNED(16, int32_t, scratch[64 * 64]);
  av1_get_fwd_txfm_cfg(tx_type, TX_64X64, &txfm_cfg);
  fwd_txfm2d_64x64_sse4_1(input, output, stride, &txfm_cfg, scratch);
}
165 
transpose_32_4x4x2(int stride,const __m128i * inputA,const __m128i * inputB,__m128i * output)166 static INLINE void transpose_32_4x4x2(int stride, const __m128i *inputA,
167                                       const __m128i *inputB, __m128i *output) {
168   __m128i temp0 = _mm_unpacklo_epi32(inputA[0], inputA[2]);
169   __m128i temp1 = _mm_unpackhi_epi32(inputA[0], inputA[2]);
170   __m128i temp2 = _mm_unpacklo_epi32(inputA[1], inputA[3]);
171   __m128i temp3 = _mm_unpackhi_epi32(inputA[1], inputA[3]);
172 
173   output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
174   output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
175   output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
176   output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
177 
178   temp0 = _mm_unpacklo_epi32(inputB[0], inputB[2]);
179   temp1 = _mm_unpackhi_epi32(inputB[0], inputB[2]);
180   temp2 = _mm_unpacklo_epi32(inputB[1], inputB[3]);
181   temp3 = _mm_unpackhi_epi32(inputB[1], inputB[3]);
182 
183   output[4 * stride] = _mm_unpacklo_epi32(temp0, temp2);
184   output[5 * stride] = _mm_unpackhi_epi32(temp0, temp2);
185   output[6 * stride] = _mm_unpacklo_epi32(temp1, temp3);
186   output[7 * stride] = _mm_unpackhi_epi32(temp1, temp3);
187 }
188 
// Low-bitdepth 64x64 forward transform (DCT_DCT only; `bd` is unused).
//
// Column pass: the input is processed in strips of 8 columns using 16-bit
// arithmetic (fdct8x64_new_sse2); only the first four 8-row groups of each
// strip's output are transposed into the intermediate buffer.
// Row pass: each group of 8 intermediate rows is sign-extended to 32 bits
// (low four lanes and high four lanes separately), transformed with
// av1_fdct64_new_sse4_1(), and only the first 32 result vectors are rounded
// and transposed into `output`, which is written with a row pitch of 32
// int32 values.
static void lowbd_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X64;
  const int8_t *const shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const int col_strips = width >> 3;
  const int row_groups = AOMMIN(4, height >> 3);
  __m128i strip[64];
  __m128i intermediate[512];

  // Column pass, one 8-column strip at a time.
  for (int s = 0; s < col_strips; ++s) {
    load_buffer_16bit_to_16bit(input + 8 * s, stride, strip, height);
    round_shift_16bit(strip, height, shift[0]);
    fdct8x64_new_sse2(strip, strip, cos_bit_col);
    round_shift_16bit(strip, height, shift[1]);
    for (int g = 0; g < row_groups; ++g) {
      transpose_16bit_8x8(strip + 8 * g, intermediate + g * width + 8 * s);
    }
  }

  // Row pass, one group of 8 rows at a time.
  for (int g = 0; g < row_groups; ++g) {
    const __m128i *rows16 = intermediate + width * g;
    __m128i lanes_lo[64];
    __m128i lanes_hi[64];
    for (int k = 0; k < width; ++k) {
      const __m128i packed = rows16[k];
      lanes_lo[k] = _mm_cvtepi16_epi32(packed);
      lanes_hi[k] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(packed, packed));
    }
    av1_fdct64_new_sse4_1(lanes_lo, lanes_lo, cos_bit_row, 1, 1);
    av1_fdct64_new_sse4_1(lanes_hi, lanes_hi, cos_bit_row, 1, 1);
    av1_round_shift_array_32_sse4_1(lanes_lo, lanes_lo, 32, -shift[2]);
    av1_round_shift_array_32_sse4_1(lanes_hi, lanes_hi, 32, -shift[2]);

    // Eight output rows of 32 int32 values (8 vectors) start here.
    __m128i *out_rows = (__m128i *)(output + 8 * 32 * g);
    for (int k = 0; k < col_strips; ++k) {
      transpose_32_4x4x2(8, lanes_lo + 4 * k, lanes_hi + 4 * k, out_rows + k);
    }
  }
}
236 
// Low-bitdepth 64x32 (64 wide, 32 high) forward transform; `bd` is unused.
//
// Only DCT_DCT is supported. The precondition is asserted before `tx_type`
// is used to select the column kernel, matching the sibling 64x64 and 32x64
// functions, so an unsupported type is reported before the table lookup and
// the call through the selected pointer.
//
// Column pass: strips of 8 columns are transformed in 16-bit arithmetic with
// the 32-point kernel from col_txfm8x32_arr and transposed into the
// intermediate buffer.
// Row pass: each group of 8 intermediate rows is sign-extended to 32 bits,
// transformed with av1_fdct64_new_sse4_1(), and only the first 32 result
// vectors are rounded (rectangular scaling by NewSqrt2) and transposed into
// `output`, which is written with a row pitch of 32 int32 values.
static void lowbd_fwd_txfm2d_64x32_sse4_1(const int16_t *input, int32_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_64X32;
  __m128i buf0[64], buf1[256];
  const int8_t *shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const transform_1d_sse2 col_txfm = col_txfm8x32_arr[tx_type];
  const int width_div8 = (width >> 3);
  const int height_div8 = (height >> 3);

  // Column pass, one 8-column strip at a time.
  for (int i = 0; i < width_div8; i++) {
    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
    round_shift_16bit(buf0, height, shift[0]);
    col_txfm(buf0, buf0, cos_bit_col);
    round_shift_16bit(buf0, height, shift[1]);
    for (int j = 0; j < AOMMIN(4, height_div8); ++j) {
      transpose_16bit_8x8(buf0 + j * 8, buf1 + j * width + 8 * i);
    }
  }

  // Row pass, one group of 8 rows at a time.
  for (int i = 0; i < AOMMIN(4, height_div8); i++) {
    __m128i bufA[64];
    __m128i bufB[64];
    __m128i *buf = buf1 + width * i;
    for (int j = 0; j < width; ++j) {
      // bufA takes the low four 16-bit lanes, bufB the high four.
      bufA[j] = _mm_cvtepi16_epi32(buf[j]);
      bufB[j] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(buf[j], buf[j]));
    }
    av1_fdct64_new_sse4_1(bufA, bufA, cos_bit_row, 1, 1);
    av1_fdct64_new_sse4_1(bufB, bufB, cos_bit_row, 1, 1);
    av1_round_shift_rect_array_32_sse4_1(bufA, bufA, 32, -shift[2], NewSqrt2);
    av1_round_shift_rect_array_32_sse4_1(bufB, bufB, 32, -shift[2], NewSqrt2);

    int32_t *output8 = output + 8 * 32 * i;
    for (int j = 0; j < width_div8; ++j) {
      __m128i *out = (__m128i *)(output8 + 4 * j);
      transpose_32_4x4x2(8, bufA + 4 * j, bufB + 4 * j, out);
    }
  }
}
283 
// Low-bitdepth 32x64 (32 wide, 64 high) forward transform (DCT_DCT only;
// `bd` is unused).
//
// Column pass: strips of 8 columns are transformed in 16-bit arithmetic with
// fdct8x64_new_sse2; only the first four 8-row groups of each strip's output
// are transposed into the intermediate buffer.
// Row pass: each group of 8 intermediate rows is sign-extended to 32 bits,
// transformed with av1_fdct32_new_sse4_1(), rounded with rectangular scaling
// by NewSqrt2, and transposed into `output`, which is written with a row
// pitch of 32 int32 values.
static void lowbd_fwd_txfm2d_32x64_sse4_1(const int16_t *input, int32_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  const TX_SIZE tx_size = TX_32X64;
  const int8_t *const shift = fwd_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int cos_bit_col = fwd_cos_bit_col[txw_idx][txh_idx];
  const int cos_bit_row = fwd_cos_bit_row[txw_idx][txh_idx];
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  const int col_strips = width >> 3;
  const int row_groups = AOMMIN(4, height >> 3);
  __m128i strip[64];
  __m128i intermediate[256];

  // Column pass, one 8-column strip at a time.
  for (int s = 0; s < col_strips; ++s) {
    load_buffer_16bit_to_16bit(input + 8 * s, stride, strip, height);
    round_shift_16bit(strip, height, shift[0]);
    fdct8x64_new_sse2(strip, strip, cos_bit_col);
    round_shift_16bit(strip, height, shift[1]);
    for (int g = 0; g < row_groups; ++g) {
      transpose_16bit_8x8(strip + 8 * g, intermediate + g * width + 8 * s);
    }
  }

  // Row pass, one group of 8 rows at a time.
  for (int g = 0; g < row_groups; ++g) {
    const __m128i *rows16 = intermediate + width * g;
    __m128i lanes_lo[32];
    __m128i lanes_hi[32];
    for (int k = 0; k < width; ++k) {
      const __m128i packed = rows16[k];
      lanes_lo[k] = _mm_cvtepi16_epi32(packed);
      lanes_hi[k] = _mm_cvtepi16_epi32(_mm_unpackhi_epi64(packed, packed));
    }
    av1_fdct32_new_sse4_1(lanes_lo, lanes_lo, cos_bit_row);
    av1_fdct32_new_sse4_1(lanes_hi, lanes_hi, cos_bit_row);
    av1_round_shift_rect_array_32_sse4_1(lanes_lo, lanes_lo, 32, -shift[2],
                                         NewSqrt2);
    av1_round_shift_rect_array_32_sse4_1(lanes_hi, lanes_hi, 32, -shift[2],
                                         NewSqrt2);

    // Eight output rows of 32 int32 values (8 vectors) start here.
    __m128i *out_rows = (__m128i *)(output + 8 * 32 * g);
    for (int k = 0; k < (32 / 4); ++k) {
      transpose_32_4x4x2(8, lanes_lo + 4 * k, lanes_hi + 4 * k, out_rows + k);
    }
  }
}
332 
333 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = {
334   av1_lowbd_fwd_txfm2d_4x4_sse2,    // 4x4 transform
335   av1_lowbd_fwd_txfm2d_8x8_sse2,    // 8x8 transform
336   av1_lowbd_fwd_txfm2d_16x16_sse2,  // 16x16 transform
337   av1_lowbd_fwd_txfm2d_32x32_sse2,  // 32x32 transform
338   lowbd_fwd_txfm2d_64x64_sse4_1,    // 64x64 transform
339   av1_lowbd_fwd_txfm2d_4x8_sse2,    // 4x8 transform
340   av1_lowbd_fwd_txfm2d_8x4_sse2,    // 8x4 transform
341   av1_lowbd_fwd_txfm2d_8x16_sse2,   // 8x16 transform
342   av1_lowbd_fwd_txfm2d_16x8_sse2,   // 16x8 transform
343   av1_lowbd_fwd_txfm2d_16x32_sse2,  // 16x32 transform
344   av1_lowbd_fwd_txfm2d_32x16_sse2,  // 32x16 transform
345   lowbd_fwd_txfm2d_32x64_sse4_1,    // 32x64 transform
346   lowbd_fwd_txfm2d_64x32_sse4_1,    // 64x32 transform
347   av1_lowbd_fwd_txfm2d_4x16_sse2,   // 4x16 transform
348   av1_lowbd_fwd_txfm2d_16x4_sse2,   // 16x4 transform
349   av1_lowbd_fwd_txfm2d_8x32_sse2,   // 8x32 transform
350   av1_lowbd_fwd_txfm2d_32x8_sse2,   // 32x8 transform
351   av1_lowbd_fwd_txfm2d_16x64_sse2,  // 16x64 transform
352   av1_lowbd_fwd_txfm2d_64x16_sse2,  // 64x16 transform
353 };
354 
// Low-bitdepth forward transform entry point.
//
// Looks up the SIMD implementation for txfm_param->tx_size and calls it with
// the transform type and bit depth. Falls back to av1_lowbd_fwd_txfm_c()
// when the table entry is NULL or when the block is a lossless 4x4.
void av1_lowbd_fwd_txfm_sse4_1(const int16_t *src_diff, tran_low_t *coeff,
                               int diff_stride, TxfmParam *txfm_param) {
  const FwdTxfm2dFunc simd_func = fwd_txfm2d_func_ls[txfm_param->tx_size];
  const int lossless_4x4 =
      txfm_param->lossless && txfm_param->tx_size == TX_4X4;
  if (simd_func != NULL && !lossless_4x4) {
    simd_func(src_diff, coeff, diff_stride, txfm_param->tx_type,
              txfm_param->bd);
    return;
  }
  av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param);
}
366