1 /*
2  *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vp9_rtcd.h"
12 #include "vp9/common/vp9_idct.h"
13 #include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
14 #include "vpx_dsp/x86/inv_txfm_sse2.h"
15 #include "vpx_dsp/x86/transpose_sse2.h"
16 #include "vpx_dsp/x86/txfm_common_sse2.h"
17 
highbd_iadst_half_butterfly_sse4_1(const __m128i in,const int c,__m128i * const s)18 static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
19                                                       const int c,
20                                                       __m128i *const s) {
21   const __m128i pair_c = pair_set_epi32(4 * c, 0);
22   __m128i x[2];
23 
24   extend_64bit(in, x);
25   s[0] = _mm_mul_epi32(pair_c, x[0]);
26   s[1] = _mm_mul_epi32(pair_c, x[1]);
27 }
28 
highbd_iadst_butterfly_sse4_1(const __m128i in0,const __m128i in1,const int c0,const int c1,__m128i * const s0,__m128i * const s1)29 static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
30                                                  const __m128i in1,
31                                                  const int c0, const int c1,
32                                                  __m128i *const s0,
33                                                  __m128i *const s1) {
34   const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
35   const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
36   __m128i t00[2], t01[2], t10[2], t11[2];
37   __m128i x0[2], x1[2];
38 
39   extend_64bit(in0, x0);
40   extend_64bit(in1, x1);
41   t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
42   t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
43   t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
44   t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
45   t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
46   t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
47   t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
48   t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
49 
50   s0[0] = _mm_add_epi64(t00[0], t11[0]);
51   s0[1] = _mm_add_epi64(t00[1], t11[1]);
52   s1[0] = _mm_sub_epi64(t10[0], t01[0]);
53   s1[1] = _mm_sub_epi64(t10[1], t01[1]);
54 }
55 
highbd_iadst8_sse4_1(__m128i * const io)56 static void highbd_iadst8_sse4_1(__m128i *const io) {
57   __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
58   __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
59 
60   transpose_32bit_4x4x2(io, io);
61 
62   // stage 1
63   highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
64   highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
65   x0[0] = _mm_add_epi64(s0[0], s4[0]);
66   x0[1] = _mm_add_epi64(s0[1], s4[1]);
67   x1[0] = _mm_add_epi64(s1[0], s5[0]);
68   x1[1] = _mm_add_epi64(s1[1], s5[1]);
69   x4[0] = _mm_sub_epi64(s0[0], s4[0]);
70   x4[1] = _mm_sub_epi64(s0[1], s4[1]);
71   x5[0] = _mm_sub_epi64(s1[0], s5[0]);
72   x5[1] = _mm_sub_epi64(s1[1], s5[1]);
73 
74   highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
75   highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
76   x2[0] = _mm_add_epi64(s2[0], s6[0]);
77   x2[1] = _mm_add_epi64(s2[1], s6[1]);
78   x3[0] = _mm_add_epi64(s3[0], s7[0]);
79   x3[1] = _mm_add_epi64(s3[1], s7[1]);
80   x6[0] = _mm_sub_epi64(s2[0], s6[0]);
81   x6[1] = _mm_sub_epi64(s2[1], s6[1]);
82   x7[0] = _mm_sub_epi64(s3[0], s7[0]);
83   x7[1] = _mm_sub_epi64(s3[1], s7[1]);
84 
85   x0[0] = dct_const_round_shift_64bit(x0[0]);
86   x0[1] = dct_const_round_shift_64bit(x0[1]);
87   x1[0] = dct_const_round_shift_64bit(x1[0]);
88   x1[1] = dct_const_round_shift_64bit(x1[1]);
89   x2[0] = dct_const_round_shift_64bit(x2[0]);
90   x2[1] = dct_const_round_shift_64bit(x2[1]);
91   x3[0] = dct_const_round_shift_64bit(x3[0]);
92   x3[1] = dct_const_round_shift_64bit(x3[1]);
93   x4[0] = dct_const_round_shift_64bit(x4[0]);
94   x4[1] = dct_const_round_shift_64bit(x4[1]);
95   x5[0] = dct_const_round_shift_64bit(x5[0]);
96   x5[1] = dct_const_round_shift_64bit(x5[1]);
97   x6[0] = dct_const_round_shift_64bit(x6[0]);
98   x6[1] = dct_const_round_shift_64bit(x6[1]);
99   x7[0] = dct_const_round_shift_64bit(x7[0]);
100   x7[1] = dct_const_round_shift_64bit(x7[1]);
101   s0[0] = pack_4(x0[0], x0[1]);  // s0 = x0;
102   s1[0] = pack_4(x1[0], x1[1]);  // s1 = x1;
103   s2[0] = pack_4(x2[0], x2[1]);  // s2 = x2;
104   s3[0] = pack_4(x3[0], x3[1]);  // s3 = x3;
105   x4[0] = pack_4(x4[0], x4[1]);
106   x5[0] = pack_4(x5[0], x5[1]);
107   x6[0] = pack_4(x6[0], x6[1]);
108   x7[0] = pack_4(x7[0], x7[1]);
109 
110   // stage 2
111   x0[0] = _mm_add_epi32(s0[0], s2[0]);
112   x1[0] = _mm_add_epi32(s1[0], s3[0]);
113   x2[0] = _mm_sub_epi32(s0[0], s2[0]);
114   x3[0] = _mm_sub_epi32(s1[0], s3[0]);
115 
116   highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
117   highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
118 
119   x4[0] = _mm_add_epi64(s4[0], s6[0]);
120   x4[1] = _mm_add_epi64(s4[1], s6[1]);
121   x5[0] = _mm_add_epi64(s5[0], s7[0]);
122   x5[1] = _mm_add_epi64(s5[1], s7[1]);
123   x6[0] = _mm_sub_epi64(s4[0], s6[0]);
124   x6[1] = _mm_sub_epi64(s4[1], s6[1]);
125   x7[0] = _mm_sub_epi64(s5[0], s7[0]);
126   x7[1] = _mm_sub_epi64(s5[1], s7[1]);
127   x4[0] = dct_const_round_shift_64bit(x4[0]);
128   x4[1] = dct_const_round_shift_64bit(x4[1]);
129   x5[0] = dct_const_round_shift_64bit(x5[0]);
130   x5[1] = dct_const_round_shift_64bit(x5[1]);
131   x6[0] = dct_const_round_shift_64bit(x6[0]);
132   x6[1] = dct_const_round_shift_64bit(x6[1]);
133   x7[0] = dct_const_round_shift_64bit(x7[0]);
134   x7[1] = dct_const_round_shift_64bit(x7[1]);
135   x4[0] = pack_4(x4[0], x4[1]);
136   x5[0] = pack_4(x5[0], x5[1]);
137   x6[0] = pack_4(x6[0], x6[1]);
138   x7[0] = pack_4(x7[0], x7[1]);
139 
140   // stage 3
141   s2[0] = _mm_add_epi32(x2[0], x3[0]);
142   s3[0] = _mm_sub_epi32(x2[0], x3[0]);
143   s6[0] = _mm_add_epi32(x6[0], x7[0]);
144   s7[0] = _mm_sub_epi32(x6[0], x7[0]);
145   highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
146   highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
147   highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
148   highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
149 
150   x2[0] = dct_const_round_shift_64bit(s2[0]);
151   x2[1] = dct_const_round_shift_64bit(s2[1]);
152   x3[0] = dct_const_round_shift_64bit(s3[0]);
153   x3[1] = dct_const_round_shift_64bit(s3[1]);
154   x6[0] = dct_const_round_shift_64bit(s6[0]);
155   x6[1] = dct_const_round_shift_64bit(s6[1]);
156   x7[0] = dct_const_round_shift_64bit(s7[0]);
157   x7[1] = dct_const_round_shift_64bit(s7[1]);
158   x2[0] = pack_4(x2[0], x2[1]);
159   x3[0] = pack_4(x3[0], x3[1]);
160   x6[0] = pack_4(x6[0], x6[1]);
161   x7[0] = pack_4(x7[0], x7[1]);
162 
163   io[0] = x0[0];
164   io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
165   io[2] = x6[0];
166   io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
167   io[4] = x3[0];
168   io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
169   io[6] = x5[0];
170   io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
171 }
172 
vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t * input,uint16_t * dest,int stride,int tx_type,int bd)173 void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
174                                      int stride, int tx_type, int bd) {
175   __m128i io[16];
176 
177   io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
178   io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
179   io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
180   io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
181   io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
182   io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
183   io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
184   io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
185   io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
186   io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
187   io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
188   io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
189   io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
190   io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
191   io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
192   io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
193 
194   if (bd == 8) {
195     __m128i io_short[8];
196 
197     io_short[0] = _mm_packs_epi32(io[0], io[4]);
198     io_short[1] = _mm_packs_epi32(io[1], io[5]);
199     io_short[2] = _mm_packs_epi32(io[2], io[6]);
200     io_short[3] = _mm_packs_epi32(io[3], io[7]);
201     io_short[4] = _mm_packs_epi32(io[8], io[12]);
202     io_short[5] = _mm_packs_epi32(io[9], io[13]);
203     io_short[6] = _mm_packs_epi32(io[10], io[14]);
204     io_short[7] = _mm_packs_epi32(io[11], io[15]);
205 
206     if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
207       vpx_idct8_sse2(io_short);
208     } else {
209       iadst8_sse2(io_short);
210     }
211     if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
212       vpx_idct8_sse2(io_short);
213     } else {
214       iadst8_sse2(io_short);
215     }
216     round_shift_8x8(io_short, io);
217   } else {
218     __m128i temp[4];
219 
220     if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
221       vpx_highbd_idct8x8_half1d_sse4_1(io);
222       vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
223     } else {
224       highbd_iadst8_sse4_1(io);
225       highbd_iadst8_sse4_1(&io[8]);
226     }
227 
228     temp[0] = io[4];
229     temp[1] = io[5];
230     temp[2] = io[6];
231     temp[3] = io[7];
232     io[4] = io[8];
233     io[5] = io[9];
234     io[6] = io[10];
235     io[7] = io[11];
236 
237     if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
238       vpx_highbd_idct8x8_half1d_sse4_1(io);
239       io[8] = temp[0];
240       io[9] = temp[1];
241       io[10] = temp[2];
242       io[11] = temp[3];
243       vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
244     } else {
245       highbd_iadst8_sse4_1(io);
246       io[8] = temp[0];
247       io[9] = temp[1];
248       io[10] = temp[2];
249       io[11] = temp[3];
250       highbd_iadst8_sse4_1(&io[8]);
251     }
252     highbd_idct8x8_final_round(io);
253   }
254   recon_and_store_8x8(io, dest, stride, bd);
255 }
256