/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
#include "vpx_dsp/x86/transpose_sse2.h"
#include "vpx_dsp/x86/txfm_common_sse2.h"

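// Transposes a 4x4 block of 16-bit coefficients held in two registers:
// res[0] = rows 0-1 and res[1] = rows 2-3 on input, columns 0-1 and
// columns 2-3 on output.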
static INLINE void transpose_16bit_4(__m128i *res) {
  const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);

  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
}

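// Full 4x4 inverse DCT: two 1-D idct4 passes (each idct4_sse2() call
// transposes before transforming, giving rows then columns), followed by
// (x + 8) >> 4, i.e. ROUND_POWER_OF_TWO(x, 4), and reconstruction into dest.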
void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  const __m128i eight = _mm_set1_epi16(8);
  __m128i in[2];

  // Rows
  in[0] = load_input_data8(input);
  in[1] = load_input_data8(input + 8);
  idct4_sse2(in);

  // Columns
  idct4_sse2(in);

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);
  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  recon_and_store4x4_sse2(in, dest, stride);
}

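// DC-only 4x4 path: the single DC coefficient is scaled by cospi_16_64 twice
// (with dct_const_round_shift() after each multiply), rounded by 4 bits, and
// the resulting value is added to all 16 destination pixels.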
void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const __m128i zero = _mm_setzero_si128();
  int a;
  __m128i dc_value, d[2];

  a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64);
  a = (int)dct_const_round_shift(a * cospi_16_64);
  a = ROUND_POWER_OF_TWO(a, 4);

  dc_value = _mm_set1_epi16(a);

  // Reconstruction and Store
  d[0] = _mm_cvtsi32_si128(*(const int *)(dest));
  d[1] = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
  d[0] = _mm_unpacklo_epi32(d[0],
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
  d[1] = _mm_unpacklo_epi32(
      _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)), d[1]);
  d[0] = _mm_unpacklo_epi8(d[0], zero);
  d[1] = _mm_unpacklo_epi8(d[1], zero);
  d[0] = _mm_add_epi16(d[0], dc_value);
  d[1] = _mm_add_epi16(d[1], dc_value);
  d[0] = _mm_packus_epi16(d[0], d[1]);

  *(int *)dest = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d[0]);
  d[0] = _mm_srli_si128(d[0], 4);
  *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]);
}

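// 1-D 4-point inverse DCT, with the 4x4 block packed two rows per register:
// transpose first, then the two butterfly stages.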
void idct4_sse2(__m128i *const in) {
  const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  __m128i u[2];

  transpose_16bit_4(in);
  // stage 1
  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
  u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]);
  u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]);

  // stage 2
  in[0] = _mm_add_epi16(u[0], u[1]);
  in[1] = _mm_sub_epi16(u[0], u[1]);
  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
}

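// 1-D 4-point inverse ADST; the per-term sinpi products are annotated inline
// below.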
void iadst4_sse2(__m128i *const in) {
  const __m128i k__sinpi_1_3 = pair_set_epi16(sinpi_1_9, sinpi_3_9);
  const __m128i k__sinpi_4_2 = pair_set_epi16(sinpi_4_9, sinpi_2_9);
  const __m128i k__sinpi_2_3 = pair_set_epi16(sinpi_2_9, sinpi_3_9);
  const __m128i k__sinpi_1_4 = pair_set_epi16(sinpi_1_9, sinpi_4_9);
  const __m128i k__sinpi_12_n3 =
      pair_set_epi16(sinpi_1_9 + sinpi_2_9, -sinpi_3_9);
  __m128i u[4], v[5];

  // 00 01 20 21  02 03 22 23
  // 10 11 30 31  12 13 32 33
  const __m128i tr0_0 = _mm_unpacklo_epi32(in[0], in[1]);
  const __m128i tr0_1 = _mm_unpackhi_epi32(in[0], in[1]);

  // 00 01 10 11  20 21 30 31
  // 02 03 12 13  22 23 32 33
  in[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
  in[1] = _mm_unpackhi_epi32(tr0_0, tr0_1);

  v[0] = _mm_madd_epi16(in[0], k__sinpi_1_3);    // s_1 * x0 + s_3 * x1
  v[1] = _mm_madd_epi16(in[1], k__sinpi_4_2);    // s_4 * x2 + s_2 * x3
  v[2] = _mm_madd_epi16(in[0], k__sinpi_2_3);    // s_2 * x0 + s_3 * x1
  v[3] = _mm_madd_epi16(in[1], k__sinpi_1_4);    // s_1 * x2 + s_4 * x3
  v[4] = _mm_madd_epi16(in[0], k__sinpi_12_n3);  // (s_1 + s_2) * x0 - s_3 * x1
  in[0] = _mm_sub_epi16(in[0], in[1]);           // x0 - x2
  in[1] = _mm_srli_epi32(in[1], 16);
  in[0] = _mm_add_epi16(in[0], in[1]);
  in[0] = _mm_slli_epi32(in[0], 16);  // x0 - x2 + x3

  u[0] = _mm_add_epi32(v[0], v[1]);
  u[1] = _mm_sub_epi32(v[2], v[3]);
  u[2] = _mm_madd_epi16(in[0], k__sinpi_1_3);
  u[3] = _mm_sub_epi32(v[1], v[3]);
  u[3] = _mm_add_epi32(u[3], v[4]);

  u[0] = dct_const_round_shift_sse2(u[0]);
  u[1] = dct_const_round_shift_sse2(u[1]);
  u[2] = dct_const_round_shift_sse2(u[2]);
  u[3] = dct_const_round_shift_sse2(u[3]);

  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
}

static INLINE void load_buffer_8x8(const tran_low_t *const input,
                                   __m128i *const in) {
  in[0] = load_input_data8(input + 0 * 8);
  in[1] = load_input_data8(input + 1 * 8);
  in[2] = load_input_data8(input + 2 * 8);
  in[3] = load_input_data8(input + 3 * 8);
  in[4] = load_input_data8(input + 4 * 8);
  in[5] = load_input_data8(input + 5 * 8);
  in[6] = load_input_data8(input + 6 * 8);
  in[7] = load_input_data8(input + 7 * 8);
}

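// Full 8x8 inverse DCT: each vpx_idct8_sse2() call transposes and then runs
// the 8-point idct, so two iterations give the row pass followed by the
// column pass; write_buffer_8x8() then reconstructs into dest.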
void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  __m128i in[8];
  int i;

  // Load input data.
  load_buffer_8x8(input, in);

  // 2-D
  for (i = 0; i < 2; i++) {
    vpx_idct8_sse2(in);
  }

  write_buffer_8x8(in, dest, stride);
}

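// 8x8 inverse DCT for the case where only the top-left 4x4 coefficients are
// non-zero: load 4 coefficients from each of the first 4 rows and use the
// reduced kernel.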
void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  __m128i io[8];

  io[0] = load_input_data4(input + 0 * 8);
  io[1] = load_input_data4(input + 1 * 8);
  io[2] = load_input_data4(input + 2 * 8);
  io[3] = load_input_data4(input + 3 * 8);

  idct8x8_12_add_kernel_sse2(io);
  write_buffer_8x8(io, dest, stride);
}

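// Adds the same 8 residual values (in_x) to two consecutive rows of dest and
// stores the clamped result.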
static INLINE void recon_and_store_8_dual(uint8_t *const dest,
                                          const __m128i in_x,
                                          const int stride) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d0, d1;

  d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride));
  d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride));
  d0 = _mm_unpacklo_epi8(d0, zero);
  d1 = _mm_unpacklo_epi8(d1, zero);
  d0 = _mm_add_epi16(in_x, d0);
  d1 = _mm_add_epi16(in_x, d1);
  d0 = _mm_packus_epi16(d0, d1);
  _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0);
  _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0));
}

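// DC-only 8x8 path: the DC coefficient is scaled by cospi_16_64 twice,
// rounded by 5 bits, and added to all 64 destination pixels, two rows per
// recon_and_store_8_dual() call.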
void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  __m128i dc_value;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  dc_value = _mm_set1_epi16((int16_t)a1);

  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
  dest += 2 * stride;
  recon_and_store_8_dual(dest, dc_value, stride);
}

void vpx_idct8_sse2(__m128i *const in) {
  // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
  transpose_16bit_8x8(in, in);

  // 4-stage 1D idct8x8
  idct8(in, in);
}

void iadst8_sse2(__m128i *const in) {
  const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
  const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
  const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
  const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
  const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
  const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
  const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i kZero = _mm_set1_epi16(0);
  __m128i s[8], u[16], v[8], w[16];

  // transpose
  transpose_16bit_8x8(in, in);

  // column transformation
  // stage 1
  // interleave and multiply/add into 32-bit integer
  s[0] = _mm_unpacklo_epi16(in[7], in[0]);
  s[1] = _mm_unpackhi_epi16(in[7], in[0]);
  s[2] = _mm_unpacklo_epi16(in[5], in[2]);
  s[3] = _mm_unpackhi_epi16(in[5], in[2]);
  s[4] = _mm_unpacklo_epi16(in[3], in[4]);
  s[5] = _mm_unpackhi_epi16(in[3], in[4]);
  s[6] = _mm_unpacklo_epi16(in[1], in[6]);
  s[7] = _mm_unpackhi_epi16(in[1], in[6]);

  u[0] = _mm_madd_epi16(s[0], k__cospi_p02_p30);
  u[1] = _mm_madd_epi16(s[1], k__cospi_p02_p30);
  u[2] = _mm_madd_epi16(s[0], k__cospi_p30_m02);
  u[3] = _mm_madd_epi16(s[1], k__cospi_p30_m02);
  u[4] = _mm_madd_epi16(s[2], k__cospi_p10_p22);
  u[5] = _mm_madd_epi16(s[3], k__cospi_p10_p22);
  u[6] = _mm_madd_epi16(s[2], k__cospi_p22_m10);
  u[7] = _mm_madd_epi16(s[3], k__cospi_p22_m10);
  u[8] = _mm_madd_epi16(s[4], k__cospi_p18_p14);
  u[9] = _mm_madd_epi16(s[5], k__cospi_p18_p14);
  u[10] = _mm_madd_epi16(s[4], k__cospi_p14_m18);
  u[11] = _mm_madd_epi16(s[5], k__cospi_p14_m18);
  u[12] = _mm_madd_epi16(s[6], k__cospi_p26_p06);
  u[13] = _mm_madd_epi16(s[7], k__cospi_p26_p06);
  u[14] = _mm_madd_epi16(s[6], k__cospi_p06_m26);
  u[15] = _mm_madd_epi16(s[7], k__cospi_p06_m26);

  // addition
  w[0] = _mm_add_epi32(u[0], u[8]);
  w[1] = _mm_add_epi32(u[1], u[9]);
  w[2] = _mm_add_epi32(u[2], u[10]);
  w[3] = _mm_add_epi32(u[3], u[11]);
  w[4] = _mm_add_epi32(u[4], u[12]);
  w[5] = _mm_add_epi32(u[5], u[13]);
  w[6] = _mm_add_epi32(u[6], u[14]);
  w[7] = _mm_add_epi32(u[7], u[15]);
  w[8] = _mm_sub_epi32(u[0], u[8]);
  w[9] = _mm_sub_epi32(u[1], u[9]);
  w[10] = _mm_sub_epi32(u[2], u[10]);
  w[11] = _mm_sub_epi32(u[3], u[11]);
  w[12] = _mm_sub_epi32(u[4], u[12]);
  w[13] = _mm_sub_epi32(u[5], u[13]);
  w[14] = _mm_sub_epi32(u[6], u[14]);
  w[15] = _mm_sub_epi32(u[7], u[15]);

  // shift and rounding
  u[0] = dct_const_round_shift_sse2(w[0]);
  u[1] = dct_const_round_shift_sse2(w[1]);
  u[2] = dct_const_round_shift_sse2(w[2]);
  u[3] = dct_const_round_shift_sse2(w[3]);
  u[4] = dct_const_round_shift_sse2(w[4]);
  u[5] = dct_const_round_shift_sse2(w[5]);
  u[6] = dct_const_round_shift_sse2(w[6]);
  u[7] = dct_const_round_shift_sse2(w[7]);
  u[8] = dct_const_round_shift_sse2(w[8]);
  u[9] = dct_const_round_shift_sse2(w[9]);
  u[10] = dct_const_round_shift_sse2(w[10]);
  u[11] = dct_const_round_shift_sse2(w[11]);
  u[12] = dct_const_round_shift_sse2(w[12]);
  u[13] = dct_const_round_shift_sse2(w[13]);
  u[14] = dct_const_round_shift_sse2(w[14]);
  u[15] = dct_const_round_shift_sse2(w[15]);

  // back to 16-bit and pack 8 integers into __m128i
  in[0] = _mm_packs_epi32(u[0], u[1]);
  in[1] = _mm_packs_epi32(u[2], u[3]);
  in[2] = _mm_packs_epi32(u[4], u[5]);
  in[3] = _mm_packs_epi32(u[6], u[7]);
  in[4] = _mm_packs_epi32(u[8], u[9]);
  in[5] = _mm_packs_epi32(u[10], u[11]);
  in[6] = _mm_packs_epi32(u[12], u[13]);
  in[7] = _mm_packs_epi32(u[14], u[15]);

  // stage 2
  s[0] = _mm_add_epi16(in[0], in[2]);
  s[1] = _mm_add_epi16(in[1], in[3]);
  s[2] = _mm_sub_epi16(in[0], in[2]);
  s[3] = _mm_sub_epi16(in[1], in[3]);
  u[0] = _mm_unpacklo_epi16(in[4], in[5]);
  u[1] = _mm_unpackhi_epi16(in[4], in[5]);
  u[2] = _mm_unpacklo_epi16(in[6], in[7]);
  u[3] = _mm_unpackhi_epi16(in[6], in[7]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);

  w[0] = _mm_add_epi32(v[0], v[4]);
  w[1] = _mm_add_epi32(v[1], v[5]);
  w[2] = _mm_add_epi32(v[2], v[6]);
  w[3] = _mm_add_epi32(v[3], v[7]);
  w[4] = _mm_sub_epi32(v[0], v[4]);
  w[5] = _mm_sub_epi32(v[1], v[5]);
  w[6] = _mm_sub_epi32(v[2], v[6]);
  w[7] = _mm_sub_epi32(v[3], v[7]);

  u[0] = dct_const_round_shift_sse2(w[0]);
  u[1] = dct_const_round_shift_sse2(w[1]);
  u[2] = dct_const_round_shift_sse2(w[2]);
  u[3] = dct_const_round_shift_sse2(w[3]);
  u[4] = dct_const_round_shift_sse2(w[4]);
  u[5] = dct_const_round_shift_sse2(w[5]);
  u[6] = dct_const_round_shift_sse2(w[6]);
  u[7] = dct_const_round_shift_sse2(w[7]);

  // back to 16-bit integers
  s[4] = _mm_packs_epi32(u[0], u[1]);
  s[5] = _mm_packs_epi32(u[2], u[3]);
  s[6] = _mm_packs_epi32(u[4], u[5]);
  s[7] = _mm_packs_epi32(u[6], u[7]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);

  s[2] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_p16);
  s[3] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  s[6] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  s[7] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_m16);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[4]);
  in[2] = s[6];
  in[3] = _mm_sub_epi16(kZero, s[2]);
  in[4] = s[3];
  in[5] = _mm_sub_epi16(kZero, s[7]);
  in[6] = s[5];
  in[7] = _mm_sub_epi16(kZero, s[1]);
}

static INLINE void idct16_load8x8(const tran_low_t *const input,
                                  __m128i *const in) {
  in[0] = load_input_data8(input + 0 * 16);
  in[1] = load_input_data8(input + 1 * 16);
  in[2] = load_input_data8(input + 2 * 16);
  in[3] = load_input_data8(input + 3 * 16);
  in[4] = load_input_data8(input + 4 * 16);
  in[5] = load_input_data8(input + 5 * 16);
  in[6] = load_input_data8(input + 6 * 16);
  in[7] = load_input_data8(input + 7 * 16);
}

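// Full 16x16 inverse DCT. The first loop runs the 1-D transform on the top
// and bottom 8x16 halves (results in l[] and r[]); the second loop transposes
// and runs the second 1-D pass in two 8-column strips, writing each strip to
// dest one 8-pixel row at a time.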
void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m128i l[16], r[16], out[16], *in;
  int i;

  in = l;
  for (i = 0; i < 2; i++) {
    idct16_load8x8(input, in);
    transpose_16bit_8x8(in, in);
    idct16_load8x8(input + 8, in + 8);
    transpose_16bit_8x8(in + 8, in + 8);
    idct16_8col(in, in);
    in = r;
    input += 128;
  }

  for (i = 0; i < 16; i += 8) {
    int j;
    transpose_16bit_8x8(l + i, out);
    transpose_16bit_8x8(r + i, out + 8);
    idct16_8col(out, out);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, out[j]);
    }

    dest += 8;
  }
}

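// 16x16 inverse DCT for the case where the non-zero coefficients are confined
// to the top-left 8x8 block: only that block is loaded and the remaining
// inputs are zeroed before the first pass.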
void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i in[16], temp[16], out[16];
  int i;

  idct16_load8x8(input, in);
  transpose_16bit_8x8(in, in);

  for (i = 8; i < 16; i++) {
    in[i] = _mm_setzero_si128();
  }
  idct16_8col(in, temp);

  for (i = 0; i < 16; i += 8) {
    int j;
    transpose_16bit_8x8(temp + i, in);
    idct16_8col(in, out);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, out[j]);
    }

    dest += 8;
  }
}

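// 16x16 inverse DCT for the case where only the top-left 4x4 coefficients are
// non-zero: load 4 coefficients from each of the first 4 rows and use the
// reduced pass1/pass2 kernels.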
void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i in[16], l[16];
  int i;

  // First 1-D inverse DCT
  // Load input data.
  in[0] = load_input_data4(input + 0 * 16);
  in[1] = load_input_data4(input + 1 * 16);
  in[2] = load_input_data4(input + 2 * 16);
  in[3] = load_input_data4(input + 3 * 16);

  idct16x16_10_pass1(in, l);

  // Second 1-D inverse transform, performed per 8x16 block
  for (i = 0; i < 16; i += 8) {
    int j;
    idct16x16_10_pass2(l + i, in);

    for (j = 0; j < 16; ++j) {
      write_buffer_8x1(dest + j * stride, in[j]);
    }

    dest += 8;
  }
}

static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d0, d1;

  d0 = _mm_load_si128((__m128i *)(dest));
  d1 = _mm_unpackhi_epi8(d0, zero);
  d0 = _mm_unpacklo_epi8(d0, zero);
  d0 = _mm_add_epi16(in_x, d0);
  d1 = _mm_add_epi16(in_x, d1);
  d0 = _mm_packus_epi16(d0, d1);
  _mm_store_si128((__m128i *)(dest), d0);
}

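// DC-only 16x16 path: the DC coefficient is scaled by cospi_16_64 twice,
// rounded by 6 bits, and added to every pixel, one 16-wide row per iteration.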
void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  int i;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  dc_value = _mm_set1_epi16((int16_t)a1);

  for (i = 0; i < 16; ++i) {
    recon_and_store_16(dest, dc_value);
    dest += stride;
  }
}

void vpx_iadst16_8col_sse2(__m128i *const in) {
  // perform 16x16 1-D ADST for 8 columns
  __m128i s[16], x[16], u[32], v[32];
  const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
  const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
  const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
  const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
  const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
  const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
  const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
  const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
  const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
  const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
  const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
  const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
  const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
  const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
  const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
  const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
  const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
  const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
  const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
  const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i kZero = _mm_set1_epi16(0);

  u[0] = _mm_unpacklo_epi16(in[15], in[0]);
  u[1] = _mm_unpackhi_epi16(in[15], in[0]);
  u[2] = _mm_unpacklo_epi16(in[13], in[2]);
  u[3] = _mm_unpackhi_epi16(in[13], in[2]);
  u[4] = _mm_unpacklo_epi16(in[11], in[4]);
  u[5] = _mm_unpackhi_epi16(in[11], in[4]);
  u[6] = _mm_unpacklo_epi16(in[9], in[6]);
  u[7] = _mm_unpackhi_epi16(in[9], in[6]);
  u[8] = _mm_unpacklo_epi16(in[7], in[8]);
  u[9] = _mm_unpackhi_epi16(in[7], in[8]);
  u[10] = _mm_unpacklo_epi16(in[5], in[10]);
  u[11] = _mm_unpackhi_epi16(in[5], in[10]);
  u[12] = _mm_unpacklo_epi16(in[3], in[12]);
  u[13] = _mm_unpackhi_epi16(in[3], in[12]);
  u[14] = _mm_unpacklo_epi16(in[1], in[14]);
  u[15] = _mm_unpackhi_epi16(in[1], in[14]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
  v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
  v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
  v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
  v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
  v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
  v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
  v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
  v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
  v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
  v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
  v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
  v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
  v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
  v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
  v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
  v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
  v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
  v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);

  u[0] = _mm_add_epi32(v[0], v[16]);
  u[1] = _mm_add_epi32(v[1], v[17]);
  u[2] = _mm_add_epi32(v[2], v[18]);
  u[3] = _mm_add_epi32(v[3], v[19]);
  u[4] = _mm_add_epi32(v[4], v[20]);
  u[5] = _mm_add_epi32(v[5], v[21]);
  u[6] = _mm_add_epi32(v[6], v[22]);
  u[7] = _mm_add_epi32(v[7], v[23]);
  u[8] = _mm_add_epi32(v[8], v[24]);
  u[9] = _mm_add_epi32(v[9], v[25]);
  u[10] = _mm_add_epi32(v[10], v[26]);
  u[11] = _mm_add_epi32(v[11], v[27]);
  u[12] = _mm_add_epi32(v[12], v[28]);
  u[13] = _mm_add_epi32(v[13], v[29]);
  u[14] = _mm_add_epi32(v[14], v[30]);
  u[15] = _mm_add_epi32(v[15], v[31]);
  u[16] = _mm_sub_epi32(v[0], v[16]);
  u[17] = _mm_sub_epi32(v[1], v[17]);
  u[18] = _mm_sub_epi32(v[2], v[18]);
  u[19] = _mm_sub_epi32(v[3], v[19]);
  u[20] = _mm_sub_epi32(v[4], v[20]);
  u[21] = _mm_sub_epi32(v[5], v[21]);
  u[22] = _mm_sub_epi32(v[6], v[22]);
  u[23] = _mm_sub_epi32(v[7], v[23]);
  u[24] = _mm_sub_epi32(v[8], v[24]);
  u[25] = _mm_sub_epi32(v[9], v[25]);
  u[26] = _mm_sub_epi32(v[10], v[26]);
  u[27] = _mm_sub_epi32(v[11], v[27]);
  u[28] = _mm_sub_epi32(v[12], v[28]);
  u[29] = _mm_sub_epi32(v[13], v[29]);
  u[30] = _mm_sub_epi32(v[14], v[30]);
  u[31] = _mm_sub_epi32(v[15], v[31]);

  u[0] = dct_const_round_shift_sse2(u[0]);
  u[1] = dct_const_round_shift_sse2(u[1]);
  u[2] = dct_const_round_shift_sse2(u[2]);
  u[3] = dct_const_round_shift_sse2(u[3]);
  u[4] = dct_const_round_shift_sse2(u[4]);
  u[5] = dct_const_round_shift_sse2(u[5]);
  u[6] = dct_const_round_shift_sse2(u[6]);
  u[7] = dct_const_round_shift_sse2(u[7]);
  u[8] = dct_const_round_shift_sse2(u[8]);
  u[9] = dct_const_round_shift_sse2(u[9]);
  u[10] = dct_const_round_shift_sse2(u[10]);
  u[11] = dct_const_round_shift_sse2(u[11]);
  u[12] = dct_const_round_shift_sse2(u[12]);
  u[13] = dct_const_round_shift_sse2(u[13]);
  u[14] = dct_const_round_shift_sse2(u[14]);
  u[15] = dct_const_round_shift_sse2(u[15]);
  u[16] = dct_const_round_shift_sse2(u[16]);
  u[17] = dct_const_round_shift_sse2(u[17]);
  u[18] = dct_const_round_shift_sse2(u[18]);
  u[19] = dct_const_round_shift_sse2(u[19]);
  u[20] = dct_const_round_shift_sse2(u[20]);
  u[21] = dct_const_round_shift_sse2(u[21]);
  u[22] = dct_const_round_shift_sse2(u[22]);
  u[23] = dct_const_round_shift_sse2(u[23]);
  u[24] = dct_const_round_shift_sse2(u[24]);
  u[25] = dct_const_round_shift_sse2(u[25]);
  u[26] = dct_const_round_shift_sse2(u[26]);
  u[27] = dct_const_round_shift_sse2(u[27]);
  u[28] = dct_const_round_shift_sse2(u[28]);
  u[29] = dct_const_round_shift_sse2(u[29]);
  u[30] = dct_const_round_shift_sse2(u[30]);
  u[31] = dct_const_round_shift_sse2(u[31]);

  s[0] = _mm_packs_epi32(u[0], u[1]);
  s[1] = _mm_packs_epi32(u[2], u[3]);
  s[2] = _mm_packs_epi32(u[4], u[5]);
  s[3] = _mm_packs_epi32(u[6], u[7]);
  s[4] = _mm_packs_epi32(u[8], u[9]);
  s[5] = _mm_packs_epi32(u[10], u[11]);
  s[6] = _mm_packs_epi32(u[12], u[13]);
  s[7] = _mm_packs_epi32(u[14], u[15]);
  s[8] = _mm_packs_epi32(u[16], u[17]);
  s[9] = _mm_packs_epi32(u[18], u[19]);
  s[10] = _mm_packs_epi32(u[20], u[21]);
  s[11] = _mm_packs_epi32(u[22], u[23]);
  s[12] = _mm_packs_epi32(u[24], u[25]);
  s[13] = _mm_packs_epi32(u[26], u[27]);
  s[14] = _mm_packs_epi32(u[28], u[29]);
  s[15] = _mm_packs_epi32(u[30], u[31]);

  // stage 2
  u[0] = _mm_unpacklo_epi16(s[8], s[9]);
  u[1] = _mm_unpackhi_epi16(s[8], s[9]);
  u[2] = _mm_unpacklo_epi16(s[10], s[11]);
  u[3] = _mm_unpackhi_epi16(s[10], s[11]);
  u[4] = _mm_unpacklo_epi16(s[12], s[13]);
  u[5] = _mm_unpackhi_epi16(s[12], s[13]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
  v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
  v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
  v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
  v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);

  u[0] = _mm_add_epi32(v[0], v[8]);
  u[1] = _mm_add_epi32(v[1], v[9]);
  u[2] = _mm_add_epi32(v[2], v[10]);
  u[3] = _mm_add_epi32(v[3], v[11]);
  u[4] = _mm_add_epi32(v[4], v[12]);
  u[5] = _mm_add_epi32(v[5], v[13]);
  u[6] = _mm_add_epi32(v[6], v[14]);
  u[7] = _mm_add_epi32(v[7], v[15]);
  u[8] = _mm_sub_epi32(v[0], v[8]);
  u[9] = _mm_sub_epi32(v[1], v[9]);
  u[10] = _mm_sub_epi32(v[2], v[10]);
  u[11] = _mm_sub_epi32(v[3], v[11]);
  u[12] = _mm_sub_epi32(v[4], v[12]);
  u[13] = _mm_sub_epi32(v[5], v[13]);
  u[14] = _mm_sub_epi32(v[6], v[14]);
  u[15] = _mm_sub_epi32(v[7], v[15]);

  u[0] = dct_const_round_shift_sse2(u[0]);
  u[1] = dct_const_round_shift_sse2(u[1]);
  u[2] = dct_const_round_shift_sse2(u[2]);
  u[3] = dct_const_round_shift_sse2(u[3]);
  u[4] = dct_const_round_shift_sse2(u[4]);
  u[5] = dct_const_round_shift_sse2(u[5]);
  u[6] = dct_const_round_shift_sse2(u[6]);
  u[7] = dct_const_round_shift_sse2(u[7]);
  u[8] = dct_const_round_shift_sse2(u[8]);
  u[9] = dct_const_round_shift_sse2(u[9]);
  u[10] = dct_const_round_shift_sse2(u[10]);
  u[11] = dct_const_round_shift_sse2(u[11]);
  u[12] = dct_const_round_shift_sse2(u[12]);
  u[13] = dct_const_round_shift_sse2(u[13]);
  u[14] = dct_const_round_shift_sse2(u[14]);
  u[15] = dct_const_round_shift_sse2(u[15]);

  x[0] = _mm_add_epi16(s[0], s[4]);
  x[1] = _mm_add_epi16(s[1], s[5]);
  x[2] = _mm_add_epi16(s[2], s[6]);
  x[3] = _mm_add_epi16(s[3], s[7]);
  x[4] = _mm_sub_epi16(s[0], s[4]);
  x[5] = _mm_sub_epi16(s[1], s[5]);
  x[6] = _mm_sub_epi16(s[2], s[6]);
  x[7] = _mm_sub_epi16(s[3], s[7]);
  x[8] = _mm_packs_epi32(u[0], u[1]);
  x[9] = _mm_packs_epi32(u[2], u[3]);
  x[10] = _mm_packs_epi32(u[4], u[5]);
  x[11] = _mm_packs_epi32(u[6], u[7]);
  x[12] = _mm_packs_epi32(u[8], u[9]);
  x[13] = _mm_packs_epi32(u[10], u[11]);
  x[14] = _mm_packs_epi32(u[12], u[13]);
  x[15] = _mm_packs_epi32(u[14], u[15]);

  // stage 3
  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
  u[4] = _mm_unpacklo_epi16(x[12], x[13]);
  u[5] = _mm_unpackhi_epi16(x[12], x[13]);
  u[6] = _mm_unpacklo_epi16(x[14], x[15]);
  u[7] = _mm_unpackhi_epi16(x[14], x[15]);

  v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
  v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
  v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
  v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
  v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
  v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
  v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
  v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
  v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
  v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
  v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
  v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
  v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
  v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
  v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
  v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);

  u[0] = _mm_add_epi32(v[0], v[4]);
  u[1] = _mm_add_epi32(v[1], v[5]);
  u[2] = _mm_add_epi32(v[2], v[6]);
  u[3] = _mm_add_epi32(v[3], v[7]);
  u[4] = _mm_sub_epi32(v[0], v[4]);
  u[5] = _mm_sub_epi32(v[1], v[5]);
  u[6] = _mm_sub_epi32(v[2], v[6]);
  u[7] = _mm_sub_epi32(v[3], v[7]);
  u[8] = _mm_add_epi32(v[8], v[12]);
  u[9] = _mm_add_epi32(v[9], v[13]);
  u[10] = _mm_add_epi32(v[10], v[14]);
  u[11] = _mm_add_epi32(v[11], v[15]);
  u[12] = _mm_sub_epi32(v[8], v[12]);
  u[13] = _mm_sub_epi32(v[9], v[13]);
  u[14] = _mm_sub_epi32(v[10], v[14]);
  u[15] = _mm_sub_epi32(v[11], v[15]);

  v[0] = dct_const_round_shift_sse2(u[0]);
  v[1] = dct_const_round_shift_sse2(u[1]);
  v[2] = dct_const_round_shift_sse2(u[2]);
  v[3] = dct_const_round_shift_sse2(u[3]);
  v[4] = dct_const_round_shift_sse2(u[4]);
  v[5] = dct_const_round_shift_sse2(u[5]);
  v[6] = dct_const_round_shift_sse2(u[6]);
  v[7] = dct_const_round_shift_sse2(u[7]);
  v[8] = dct_const_round_shift_sse2(u[8]);
  v[9] = dct_const_round_shift_sse2(u[9]);
  v[10] = dct_const_round_shift_sse2(u[10]);
  v[11] = dct_const_round_shift_sse2(u[11]);
  v[12] = dct_const_round_shift_sse2(u[12]);
  v[13] = dct_const_round_shift_sse2(u[13]);
  v[14] = dct_const_round_shift_sse2(u[14]);
  v[15] = dct_const_round_shift_sse2(u[15]);

  s[0] = _mm_add_epi16(x[0], x[2]);
  s[1] = _mm_add_epi16(x[1], x[3]);
  s[2] = _mm_sub_epi16(x[0], x[2]);
  s[3] = _mm_sub_epi16(x[1], x[3]);
  s[4] = _mm_packs_epi32(v[0], v[1]);
  s[5] = _mm_packs_epi32(v[2], v[3]);
  s[6] = _mm_packs_epi32(v[4], v[5]);
  s[7] = _mm_packs_epi32(v[6], v[7]);
  s[8] = _mm_add_epi16(x[8], x[10]);
  s[9] = _mm_add_epi16(x[9], x[11]);
  s[10] = _mm_sub_epi16(x[8], x[10]);
  s[11] = _mm_sub_epi16(x[9], x[11]);
  s[12] = _mm_packs_epi32(v[8], v[9]);
  s[13] = _mm_packs_epi32(v[10], v[11]);
  s[14] = _mm_packs_epi32(v[12], v[13]);
  s[15] = _mm_packs_epi32(v[14], v[15]);

  // stage 4
  u[0] = _mm_unpacklo_epi16(s[2], s[3]);
  u[1] = _mm_unpackhi_epi16(s[2], s[3]);
  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
  u[4] = _mm_unpacklo_epi16(s[10], s[11]);
  u[5] = _mm_unpackhi_epi16(s[10], s[11]);
  u[6] = _mm_unpacklo_epi16(s[14], s[15]);
  u[7] = _mm_unpackhi_epi16(s[14], s[15]);

  in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16);
  in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16);
  in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16);
  in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16);
  in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16);
  in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16);
  in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16);
  in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16);

  in[0] = s[0];
  in[1] = _mm_sub_epi16(kZero, s[8]);
  in[2] = s[12];
  in[3] = _mm_sub_epi16(kZero, s[4]);
  in[12] = s[5];
  in[13] = _mm_sub_epi16(kZero, s[13]);
  in[14] = s[9];
  in[15] = _mm_sub_epi16(kZero, s[1]);
}

void idct16_sse2(__m128i *const in0, __m128i *const in1) {
  transpose_16bit_16x16(in0, in1);
  idct16_8col(in0, in0);
  idct16_8col(in1, in1);
}

void iadst16_sse2(__m128i *const in0, __m128i *const in1) {
  transpose_16bit_16x16(in0, in1);
  vpx_iadst16_8col_sse2(in0);
  vpx_iadst16_8col_sse2(in1);
}

// Group the coefficient calculation into smaller functions to prevent stack
// spillover in 32x32 idct optimizations:
// quarter_1: 0-7
// quarter_2: 8-15
// quarter_3_4: 16-23, 24-31

// For each 8x32 block __m128i in[32],
// Input with index: 0, 4
// output pixels: 0-7 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/,
                                            __m128i *const out /*out[8]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[8], step2[8];

  // stage 3
  butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]);

  // stage 4
  step2[0] = butterfly_cospi16(in[0]);
  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];

  // stage 5
  step1[0] = step2[0];
  step1[1] = step2[0];
  step1[2] = step2[0];
  step1[3] = step2[0];
  step1[4] = step2[4];
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6
  out[0] = _mm_add_epi16(step1[0], step1[7]);
  out[1] = _mm_add_epi16(step1[1], step1[6]);
  out[2] = _mm_add_epi16(step1[2], step1[5]);
  out[3] = _mm_add_epi16(step1[3], step1[4]);
  out[4] = _mm_sub_epi16(step1[3], step1[4]);
  out[5] = _mm_sub_epi16(step1[2], step1[5]);
  out[6] = _mm_sub_epi16(step1[1], step1[6]);
  out[7] = _mm_sub_epi16(step1[0], step1[7]);
}

// For each 8x32 block __m128i in[32],
// Input with index: 2, 6
// output pixels: 8-15 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/,
                                            __m128i *const out /*out[16]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[16], step2[16];

  // stage 2
  butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
  butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);

  // stage 3
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];

  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}

static INLINE void idct32_34_8x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  idct32_34_8x32_quarter_1(in, temp);
  idct32_34_8x32_quarter_2(in, temp);
  // stage 7
  add_sub_butterfly(temp, out, 16);
}

// For each 8x32 block __m128i in[32],
// Input with odd index: 1, 3, 5, 7
// output pixels: 16-23, 24-31 in __m128i out[32]
static INLINE void idct32_34_8x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  const __m128i zero = _mm_setzero_si128();
  __m128i step1[32];

  // stage 1
  butterfly(in[1], zero, cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
  butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);
  butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
  butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);

  // stage 3
  butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17],
            &step1[30]);
  butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18],
            &step1[29]);
  butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21],
            &step1[26]);
  butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22],
            &step1[25]);

  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
}

void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/,
                         __m128i *const out /*out[32]*/) {
  __m128i temp[32];

  idct32_34_8x32_quarter_1_2(in, temp);
  idct32_34_8x32_quarter_3_4(in, temp);
  // final stage
  add_sub_butterfly(temp, out, 32);
}

// Only upper-left 8x8 has non-zero coeff
void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  __m128i io[32], col[32];
  int i;

  // Load input data. Only need to load the top left 8x8 block.
  load_transpose_16bit_8x8(input, 32, io);
  idct32_34_8x32_sse2(io, col);

  for (i = 0; i < 32; i += 8) {
    int j;
    transpose_16bit_8x8(col + i, io);
    idct32_34_8x32_sse2(io, io);

    for (j = 0; j < 32; ++j) {
      write_buffer_8x1(dest + j * stride, io[j]);
    }

    dest += 8;
  }
}

// For each 8x32 block __m128i in[32],
// Input with index: 0, 4, 8, 12, 16, 20, 24, 28
// output pixels: 0-7 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_1(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) {
  __m128i step1[8], step2[8];

  // stage 3
  butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]);
  butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]);

  // stage 4
  butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]);
  butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]);
  step2[4] = _mm_add_epi16(step1[4], step1[5]);
  step2[5] = _mm_sub_epi16(step1[4], step1[5]);
  step2[6] = _mm_sub_epi16(step1[7], step1[6]);
  step2[7] = _mm_add_epi16(step1[7], step1[6]);

  // stage 5
  step1[0] = _mm_add_epi16(step2[0], step2[3]);
  step1[1] = _mm_add_epi16(step2[1], step2[2]);
  step1[2] = _mm_sub_epi16(step2[1], step2[2]);
  step1[3] = _mm_sub_epi16(step2[0], step2[3]);
  step1[4] = step2[4];
  butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]);
  step1[7] = step2[7];

  // stage 6
  out[0] = _mm_add_epi16(step1[0], step1[7]);
  out[1] = _mm_add_epi16(step1[1], step1[6]);
  out[2] = _mm_add_epi16(step1[2], step1[5]);
  out[3] = _mm_add_epi16(step1[3], step1[4]);
  out[4] = _mm_sub_epi16(step1[3], step1[4]);
  out[5] = _mm_sub_epi16(step1[2], step1[5]);
  out[6] = _mm_sub_epi16(step1[1], step1[6]);
  out[7] = _mm_sub_epi16(step1[0], step1[7]);
}

// For each 8x32 block __m128i in[32],
// Input with index: 2, 6, 10, 14, 18, 22, 26, 30
// output pixels: 8-15 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) {
  __m128i step1[16], step2[16];

  // stage 2
  butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]);
  butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]);
  butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]);
  butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]);

  // stage 3
  step1[8] = _mm_add_epi16(step2[8], step2[9]);
  step1[9] = _mm_sub_epi16(step2[8], step2[9]);
  step1[10] = _mm_sub_epi16(step2[11], step2[10]);
  step1[11] = _mm_add_epi16(step2[11], step2[10]);
  step1[12] = _mm_add_epi16(step2[12], step2[13]);
  step1[13] = _mm_sub_epi16(step2[12], step2[13]);
  step1[14] = _mm_sub_epi16(step2[15], step2[14]);
  step1[15] = _mm_add_epi16(step2[15], step2[14]);

  idct32_8x32_quarter_2_stage_4_to_6(step1, out);
}

static INLINE void idct32_1024_8x32_quarter_1_2(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i temp[16];
  idct32_1024_8x32_quarter_1(in, temp);
  idct32_1024_8x32_quarter_2(in, temp);
  // stage 7
  add_sub_butterfly(temp, out, 16);
}

// For each 8x32 block __m128i in[32],
// Input with odd index:
// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
// output pixels: 16-23, 24-31 in __m128i out[32]
static INLINE void idct32_1024_8x32_quarter_3_4(
    const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) {
  __m128i step1[32], step2[32];

  // stage 1
  butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]);
  butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]);
  butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]);
  butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]);

  butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]);
  butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]);

  butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]);
  butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]);

  // stage 2
  step2[16] = _mm_add_epi16(step1[16], step1[17]);
  step2[17] = _mm_sub_epi16(step1[16], step1[17]);
  step2[18] = _mm_sub_epi16(step1[19], step1[18]);
  step2[19] = _mm_add_epi16(step1[19], step1[18]);
  step2[20] = _mm_add_epi16(step1[20], step1[21]);
  step2[21] = _mm_sub_epi16(step1[20], step1[21]);
  step2[22] = _mm_sub_epi16(step1[23], step1[22]);
  step2[23] = _mm_add_epi16(step1[23], step1[22]);

  step2[24] = _mm_add_epi16(step1[24], step1[25]);
  step2[25] = _mm_sub_epi16(step1[24], step1[25]);
  step2[26] = _mm_sub_epi16(step1[27], step1[26]);
  step2[27] = _mm_add_epi16(step1[27], step1[26]);
  step2[28] = _mm_add_epi16(step1[28], step1[29]);
  step2[29] = _mm_sub_epi16(step1[28], step1[29]);
  step2[30] = _mm_sub_epi16(step1[31], step1[30]);
  step2[31] = _mm_add_epi16(step1[31], step1[30]);

  // stage 3
  step1[16] = step2[16];
  step1[31] = step2[31];
  butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17],
            &step1[30]);
  butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18],
            &step1[29]);
  step1[19] = step2[19];
  step1[20] = step2[20];
  butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21],
            &step1[26]);
  butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22],
            &step1[25]);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  idct32_8x32_quarter_3_4_stage_4_to_7(step1, out);
}

void idct32_1024_8x32(const __m128i *const in /*in[32]*/,
                      __m128i *const out /*out[32]*/) {
  __m128i temp[32];

  idct32_1024_8x32_quarter_1_2(in, temp);
  idct32_1024_8x32_quarter_3_4(in, temp);
  // final stage
  add_sub_butterfly(temp, out, 32);
}

void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride) {
  __m128i col[4][32], io[32];
  int i;

  // rows
  for (i = 0; i < 4; i++) {
    load_transpose_16bit_8x8(&input[0], 32, &io[0]);
    load_transpose_16bit_8x8(&input[8], 32, &io[8]);
    load_transpose_16bit_8x8(&input[16], 32, &io[16]);
    load_transpose_16bit_8x8(&input[24], 32, &io[24]);
    idct32_1024_8x32(io, col[i]);
    input += 32 << 3;
  }

  // columns
  for (i = 0; i < 32; i += 8) {
    // Transpose 32x8 block to 8x32 block
    transpose_16bit_8x8(col[0] + i, io);
    transpose_16bit_8x8(col[1] + i, io + 8);
    transpose_16bit_8x8(col[2] + i, io + 16);
    transpose_16bit_8x8(col[3] + i, io + 24);

    idct32_1024_8x32(io, io);
    store_buffer_8x32(io, dest, stride);
    dest += 8;
  }
}

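// 32x32 inverse DCT for the case where the non-zero coefficients are confined
// to the top-left 16x16 block: the lower 16 inputs stay zero and the row pass
// needs only two 8-row iterations.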
void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest,
                                int stride) {
  __m128i col[2][32], in[32], out[32];
  int i;

  for (i = 16; i < 32; i++) {
    in[i] = _mm_setzero_si128();
  }

  // rows
  for (i = 0; i < 2; i++) {
    load_transpose_16bit_8x8(&input[0], 32, &in[0]);
    load_transpose_16bit_8x8(&input[8], 32, &in[8]);
    idct32_1024_8x32(in, col[i]);
    input += 32 << 3;
  }

  // columns
  for (i = 0; i < 32; i += 8) {
    transpose_16bit_8x8(col[0] + i, in);
    transpose_16bit_8x8(col[1] + i, in + 8);
    idct32_1024_8x32(in, out);
    store_buffer_8x32(out, dest, stride);
    dest += 8;
  }
}

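// DC-only 32x32 path: the DC coefficient is scaled by cospi_16_64 twice,
// rounded by 6 bits, and added to every pixel, two 16-wide stores per row.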
void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  __m128i dc_value;
  int j;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  dc_value = _mm_set1_epi16((int16_t)a1);

  for (j = 0; j < 32; ++j) {
    recon_and_store_16(dest + j * stride + 0, dc_value);
    recon_and_store_16(dest + j * stride + 16, dc_value);
  }
}