/*
 *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vp9_rtcd.h"
#include "vp9/common/vp9_idct.h"
#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

highbd_iadst4_sse4_1(__m128i * const io)18 static INLINE void highbd_iadst4_sse4_1(__m128i *const io) {
19   const __m128i pair_c1 = pair_set_epi32(4 * sinpi_1_9, 0);
20   const __m128i pair_c2 = pair_set_epi32(4 * sinpi_2_9, 0);
21   const __m128i pair_c3 = pair_set_epi32(4 * sinpi_3_9, 0);
22   const __m128i pair_c4 = pair_set_epi32(4 * sinpi_4_9, 0);
23   __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], t0[2], t1[2], t2[2];
24   __m128i temp[2];
25 
26   transpose_32bit_4x4(io, io);
27 
28   extend_64bit(io[0], temp);
29   s0[0] = _mm_mul_epi32(pair_c1, temp[0]);
30   s0[1] = _mm_mul_epi32(pair_c1, temp[1]);
31   s1[0] = _mm_mul_epi32(pair_c2, temp[0]);
32   s1[1] = _mm_mul_epi32(pair_c2, temp[1]);
33 
34   extend_64bit(io[1], temp);
35   s2[0] = _mm_mul_epi32(pair_c3, temp[0]);
36   s2[1] = _mm_mul_epi32(pair_c3, temp[1]);
37 
38   extend_64bit(io[2], temp);
39   s3[0] = _mm_mul_epi32(pair_c4, temp[0]);
40   s3[1] = _mm_mul_epi32(pair_c4, temp[1]);
41   s4[0] = _mm_mul_epi32(pair_c1, temp[0]);
42   s4[1] = _mm_mul_epi32(pair_c1, temp[1]);
43 
44   extend_64bit(io[3], temp);
45   s5[0] = _mm_mul_epi32(pair_c2, temp[0]);
46   s5[1] = _mm_mul_epi32(pair_c2, temp[1]);
47   s6[0] = _mm_mul_epi32(pair_c4, temp[0]);
48   s6[1] = _mm_mul_epi32(pair_c4, temp[1]);
49 
50   t0[0] = _mm_add_epi64(s0[0], s3[0]);
51   t0[1] = _mm_add_epi64(s0[1], s3[1]);
52   t0[0] = _mm_add_epi64(t0[0], s5[0]);
53   t0[1] = _mm_add_epi64(t0[1], s5[1]);
54   t1[0] = _mm_sub_epi64(s1[0], s4[0]);
55   t1[1] = _mm_sub_epi64(s1[1], s4[1]);
56   t1[0] = _mm_sub_epi64(t1[0], s6[0]);
57   t1[1] = _mm_sub_epi64(t1[1], s6[1]);
58   temp[0] = _mm_sub_epi32(io[0], io[2]);
59   temp[0] = _mm_add_epi32(temp[0], io[3]);
60   extend_64bit(temp[0], temp);
61   t2[0] = _mm_mul_epi32(pair_c3, temp[0]);
62   t2[1] = _mm_mul_epi32(pair_c3, temp[1]);
63 
64   s0[0] = _mm_add_epi64(t0[0], s2[0]);
65   s0[1] = _mm_add_epi64(t0[1], s2[1]);
66   s1[0] = _mm_add_epi64(t1[0], s2[0]);
67   s1[1] = _mm_add_epi64(t1[1], s2[1]);
68   s3[0] = _mm_add_epi64(t0[0], t1[0]);
69   s3[1] = _mm_add_epi64(t0[1], t1[1]);
70   s3[0] = _mm_sub_epi64(s3[0], s2[0]);
71   s3[1] = _mm_sub_epi64(s3[1], s2[1]);
72 
73   s0[0] = dct_const_round_shift_64bit(s0[0]);
74   s0[1] = dct_const_round_shift_64bit(s0[1]);
75   s1[0] = dct_const_round_shift_64bit(s1[0]);
76   s1[1] = dct_const_round_shift_64bit(s1[1]);
77   s2[0] = dct_const_round_shift_64bit(t2[0]);
78   s2[1] = dct_const_round_shift_64bit(t2[1]);
79   s3[0] = dct_const_round_shift_64bit(s3[0]);
80   s3[1] = dct_const_round_shift_64bit(s3[1]);
81   io[0] = pack_4(s0[0], s0[1]);
82   io[1] = pack_4(s1[0], s1[1]);
83   io[2] = pack_4(s2[0], s2[1]);
84   io[3] = pack_4(s3[0], s3[1]);
85 }
86 
// 4x4 inverse hybrid transform (DCT/ADST per dimension, selected by
// tx_type) plus reconstruction: the inverse-transformed residual is added
// to dest and stored, clipped for bit depth bd.
//
// For bd == 8 the coefficients fit in 16 bits, so the faster low bit-depth
// SSE2 transforms are used; otherwise the 32-bit SSE4.1 high bit-depth
// versions run.
void vp9_highbd_iht4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest,
                                     int stride, int tx_type, int bd) {
  // Decide once, per pass, whether the 1-D transform is a DCT (otherwise
  // it is an ADST). Pass 1 is DCT for DCT_DCT/ADST_DCT; pass 2 is DCT for
  // DCT_DCT/DCT_ADST.
  const int pass1_is_dct = (tx_type == DCT_DCT || tx_type == ADST_DCT);
  const int pass2_is_dct = (tx_type == DCT_DCT || tx_type == DCT_ADST);
  __m128i io[4];
  int i;

  // Load the 16 input coefficients, four 32-bit values per register.
  for (i = 0; i < 4; i++) {
    io[i] = _mm_load_si128((const __m128i *)(input + 4 * i));
  }

  if (bd == 8) {
    __m128i c16[2];

    // Narrow to 16 bits (eight coefficients per register) for the low
    // bit-depth transforms.
    c16[0] = _mm_packs_epi32(io[0], io[1]);
    c16[1] = _mm_packs_epi32(io[2], io[3]);
    if (pass1_is_dct) {
      idct4_sse2(c16);
    } else {
      iadst4_sse2(c16);
    }
    if (pass2_is_dct) {
      idct4_sse2(c16);
    } else {
      iadst4_sse2(c16);
    }
    // Final rounding: add 8 and shift right by 4.
    c16[0] = _mm_add_epi16(c16[0], _mm_set1_epi16(8));
    c16[1] = _mm_add_epi16(c16[1], _mm_set1_epi16(8));
    io[0] = _mm_srai_epi16(c16[0], 4);
    io[1] = _mm_srai_epi16(c16[1], 4);
  } else {
    if (pass1_is_dct) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    if (pass2_is_dct) {
      highbd_idct4_sse4_1(io);
    } else {
      highbd_iadst4_sse4_1(io);
    }
    // Round (+8), shift by 4 and narrow to 16-bit values in io[0]/io[1].
    io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8));
    io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8));
  }

  recon_and_store_4x4(io, dest, stride, bd);
}
132