1 /*****************************************************************************
2  * Copyright (C) 2013 x265 project
3  *
4  * Authors: Steve Borho <steve@borho.org>
5  *          Mandar Gurav <mandar@multicorewareinc.com>
6  *          Deepthi Devaki Akkoorath <deepthidevaki@multicorewareinc.com>
7  *          Mahesh Pittala <mahesh@multicorewareinc.com>
8  *          Rajesh Paulraj <rajesh@multicorewareinc.com>
9  *          Min Chen <min.chen@multicorewareinc.com>
10  *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
11  *          Nabajit Deka <nabajit@multicorewareinc.com>
12  *
13  * This program is free software; you can redistribute it and/or modify
14  * it under the terms of the GNU General Public License as published by
15  * the Free Software Foundation; either version 2 of the License, or
16  * (at your option) any later version.
17  *
18  * This program is distributed in the hope that it will be useful,
19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
21  * GNU General Public License for more details.
22  *
23  * You should have received a copy of the GNU General Public License
24  * along with this program; if not, write to the Free Software
25  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
26  *
27  * This program is also available under a commercial proprietary license.
28  * For more information, contact us at license @ x265.com.
29  *****************************************************************************/
30 
31 #include "common.h"
32 #include "primitives.h"
33 #include <xmmintrin.h> // SSE
34 #include <pmmintrin.h> // SSE3
35 #include <tmmintrin.h> // SSSE3
36 
37 #define DCT16_SHIFT1  (3 + X265_DEPTH - 8)
38 #define DCT16_ADD1    (1 << ((DCT16_SHIFT1) - 1))
39 
40 #define DCT16_SHIFT2  10
41 #define DCT16_ADD2    (1 << ((DCT16_SHIFT2) - 1))
42 
43 #define DCT32_SHIFT1  (DCT16_SHIFT1 + 1)
44 #define DCT32_ADD1    (1 << ((DCT32_SHIFT1) - 1))
45 
46 #define DCT32_SHIFT2  (DCT16_SHIFT2 + 1)
47 #define DCT32_ADD2    (1 << ((DCT32_SHIFT2) - 1))
48 
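/* The two DCT passes follow the usual HEVC forward-transform scaling: pass 1
 * shifts by log2(size) - 1 plus the extra bit-depth bits, pass 2 by
 * log2(size) + 6, each rounding to nearest via the matching ADD constant.
 * A minimal scalar sketch of that rounding step (illustrative only, kept out
 * of the build; the SIMD code below does the same thing with
 * _mm_add_epi32/_mm_srai_epi32):
 */
#if 0
static inline int32_t roundShift(int32_t sum, int shift)
{
    // For 8-bit input DCT16_SHIFT1 == 3, so roundShift(x, DCT16_SHIFT1) == (x + 4) >> 3
    return (sum + (1 << (shift - 1))) >> shift;
}
#endif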
49 using namespace X265_NS;
50 
51 ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
52 {
53     { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },
54 
55     { 64, 64, 64, 64, 64, 64, 64, 64 },
56     { 64, -64, 64, -64, 64, -64, 64, -64 },
57     { 83, 36, 83, 36, 83, 36, 83, 36 },
58     { 36, -83, 36, -83, 36, -83, 36, -83 },
59     { 89, 18, 75, 50, 89, 18, 75, 50 },
60     { 75, -50, -18, -89, 75, -50, -18, -89 },
61     { 50, 75, -89, 18, 50, 75, -89, 18 },
62     { 18, -89, -50, 75, 18, -89, -50, 75 },
63 
64     { 83, 83, -83, -83, 36, 36, -36, -36 },
65     { 36, 36, -36, -36, -83, -83, 83, 83 },
66     { 89, -89, 18, -18, 75, -75, 50, -50 },
67     { 75, -75, -50, 50, -18, 18, -89, 89 },
68     { 50, -50, 75, -75, -89, 89, 18, -18 },
69     { 18, -18, -89, 89, -50, 50, 75, -75 },
70 };
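/* In tab_dct_8, row 0 is a byte-shuffle mask for _mm_shuffle_epi8; the
 * remaining rows hold 16-bit DCT coefficients laid out in adjacent pairs so
 * that _mm_madd_epi16 yields four 32-bit two-tap dot products per call, which
 * the later _mm_hadd_epi32 steps combine.  A scalar sketch of what a single
 * pmaddwd lane computes (illustrative only, not compiled):
 */
#if 0
static inline int32_t maddLane(const int16_t *v, const int16_t *coef)
{
    // With coef = tab_dct_8[3] = { 83, 36, ... } this returns 83 * v[0] + 36 * v[1]
    return (int32_t)v[0] * coef[0] + (int32_t)v[1] * coef[1];
}
#endif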
71 
72 ALIGN_VAR_32(static const int16_t, tab_dct_16_0[][8]) =
73 {
74     { 0x0F0E, 0x0D0C, 0x0B0A, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100 },  // 0
75     { 0x0100, 0x0F0E, 0x0706, 0x0908, 0x0302, 0x0D0C, 0x0504, 0x0B0A },  // 1
76     { 0x0100, 0x0706, 0x0302, 0x0504, 0x0F0E, 0x0908, 0x0D0C, 0x0B0A },  // 2
77     { 0x0F0E, 0x0908, 0x0D0C, 0x0B0A, 0x0100, 0x0706, 0x0302, 0x0504 },  // 3
78 };
79 
80 ALIGN_VAR_32(static const int16_t, tab_dct_16_1[][8]) =
81 {
82     { 90, 87, 80, 70, 57, 43, 25,  9 },  //  0
83     { 87, 57,  9, -43, -80, -90, -70, -25 },  //  1
84     { 80,  9, -70, -87, -25, 57, 90, 43 },  //  2
85     { 70, -43, -87,  9, 90, 25, -80, -57 },  //  3
86     { 57, -80, -25, 90, -9, -87, 43, 70 },  //  4
87     { 43, -90, 57, 25, -87, 70,  9, -80 },  //  5
88     { 25, -70, 90, -80, 43,  9, -57, 87 },  //  6
89     {  9, -25, 43, -57, 70, -80, 87, -90 },  //  7
90     { 83, 83, -83, -83, 36, 36, -36, -36 },  //  8
91     { 36, 36, -36, -36, -83, -83, 83, 83 },  //  9
92     { 89, 89, 18, 18, 75, 75, 50, 50 },  // 10
93     { 75, 75, -50, -50, -18, -18, -89, -89 },  // 11
94     { 50, 50, 75, 75, -89, -89, 18, 18 },  // 12
95     { 18, 18, -89, -89, -50, -50, 75, 75 },  // 13
96 
97 #define MAKE_COEF(a0, a1, a2, a3, a4, a5, a6, a7) \
98     { (a0), -(a0), (a3), -(a3), (a1), -(a1), (a2), -(a2) \
99     }, \
100     { (a7), -(a7), (a4), -(a4), (a6), -(a6), (a5), -(a5) },
101 
102     MAKE_COEF(90, 87, 80, 70, 57, 43, 25,  9)
103     MAKE_COEF(87, 57,  9, -43, -80, -90, -70, -25)
104     MAKE_COEF(80,  9, -70, -87, -25, 57, 90, 43)
105     MAKE_COEF(70, -43, -87,  9, 90, 25, -80, -57)
106     MAKE_COEF(57, -80, -25, 90, -9, -87, 43, 70)
107     MAKE_COEF(43, -90, 57, 25, -87, 70,  9, -80)
108     MAKE_COEF(25, -70, 90, -80, 43,  9, -57, 87)
109     MAKE_COEF(9, -25, 43, -57, 70, -80, 87, -90)
110 #undef MAKE_COEF
111 };
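/* For reference, the first MAKE_COEF expansion above,
 * MAKE_COEF(90, 87, 80, 70, 57, 43, 25, 9), emits the two rows
 * { 90, -90, 70, -70, 87, -87, 80, -80 } and { 9, -9, 57, -57, 25, -25, 43, -43 },
 * i.e. the odd-row coefficients in the interleaved +/- order consumed by the
 * second-pass _mm_madd_epi16 calls in dct16(). */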
112 
113 static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
114 {
115     // Const
116     __m128i c_4     = _mm_set1_epi32(DCT16_ADD1);
117     __m128i c_512   = _mm_set1_epi32(DCT16_ADD2);
118 
119     int i;
120 
121     ALIGN_VAR_32(int16_t, tmp[16 * 16]);
122 
123     __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
124     __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
125     __m128i T10, T11, T12, T13, T14, T15, T16, T17;
126     __m128i T20, T21, T22, T23, T24, T25, T26, T27;
127     __m128i T30, T31, T32, T33, T34, T35, T36, T37;
128     __m128i T40, T41, T42, T43, T44, T45, T46, T47;
129     __m128i T50, T51, T52, T53;
130     __m128i T60, T61, T62, T63, T64, T65, T66, T67;
131     __m128i T70;
132 
133     // DCT1
134     for (i = 0; i < 16; i += 8)
135     {
136         T00A = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
137         T00B = _mm_load_si128((__m128i*)&src[(i + 0) * stride + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
138         T01A = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 0]);    // [17 16 15 14 13 12 11 10]
139         T01B = _mm_load_si128((__m128i*)&src[(i + 1) * stride + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
140         T02A = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 0]);    // [27 26 25 24 23 22 21 20]
141         T02B = _mm_load_si128((__m128i*)&src[(i + 2) * stride + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
142         T03A = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 0]);    // [37 36 35 34 33 32 31 30]
143         T03B = _mm_load_si128((__m128i*)&src[(i + 3) * stride + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
144         T04A = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 0]);    // [47 46 45 44 43 42 41 40]
145         T04B = _mm_load_si128((__m128i*)&src[(i + 4) * stride + 8]);    // [4F 4E 4D 4C 4B 4A 49 48]
146         T05A = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 0]);    // [57 56 55 54 53 52 51 50]
147         T05B = _mm_load_si128((__m128i*)&src[(i + 5) * stride + 8]);    // [5F 5E 5D 5C 5B 5A 59 58]
148         T06A = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 0]);    // [67 66 65 64 63 62 61 60]
149         T06B = _mm_load_si128((__m128i*)&src[(i + 6) * stride + 8]);    // [6F 6E 6D 6C 6B 6A 69 68]
150         T07A = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 0]);    // [77 76 75 74 73 72 71 70]
151         T07B = _mm_load_si128((__m128i*)&src[(i + 7) * stride + 8]);    // [7F 7E 7D 7C 7B 7A 79 78]
152 
153         T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
154         T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
155         T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
156         T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
157         T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
158         T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
159         T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
160         T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
161 
162         T10  = _mm_add_epi16(T00A, T00B);
163         T11  = _mm_add_epi16(T01A, T01B);
164         T12  = _mm_add_epi16(T02A, T02B);
165         T13  = _mm_add_epi16(T03A, T03B);
166         T14  = _mm_add_epi16(T04A, T04B);
167         T15  = _mm_add_epi16(T05A, T05B);
168         T16  = _mm_add_epi16(T06A, T06B);
169         T17  = _mm_add_epi16(T07A, T07B);
170 
171         T20  = _mm_sub_epi16(T00A, T00B);
172         T21  = _mm_sub_epi16(T01A, T01B);
173         T22  = _mm_sub_epi16(T02A, T02B);
174         T23  = _mm_sub_epi16(T03A, T03B);
175         T24  = _mm_sub_epi16(T04A, T04B);
176         T25  = _mm_sub_epi16(T05A, T05B);
177         T26  = _mm_sub_epi16(T06A, T06B);
178         T27  = _mm_sub_epi16(T07A, T07B);
179 
180         T30  = _mm_shuffle_epi8(T10, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
181         T31  = _mm_shuffle_epi8(T11, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
182         T32  = _mm_shuffle_epi8(T12, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
183         T33  = _mm_shuffle_epi8(T13, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
184         T34  = _mm_shuffle_epi8(T14, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
185         T35  = _mm_shuffle_epi8(T15, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
186         T36  = _mm_shuffle_epi8(T16, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
187         T37  = _mm_shuffle_epi8(T17, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
188 
189         T40  = _mm_hadd_epi16(T30, T31);
190         T41  = _mm_hadd_epi16(T32, T33);
191         T42  = _mm_hadd_epi16(T34, T35);
192         T43  = _mm_hadd_epi16(T36, T37);
193         T44  = _mm_hsub_epi16(T30, T31);
194         T45  = _mm_hsub_epi16(T32, T33);
195         T46  = _mm_hsub_epi16(T34, T35);
196         T47  = _mm_hsub_epi16(T36, T37);
197 
198         T50  = _mm_hadd_epi16(T40, T41);
199         T51  = _mm_hadd_epi16(T42, T43);
200         T52  = _mm_hsub_epi16(T40, T41);
201         T53  = _mm_hsub_epi16(T42, T43);
202 
203         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
204         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
205         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
206         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
207         T70  = _mm_packs_epi32(T60, T61);
208         _mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
209 
210         T60  = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
211         T61  = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
212         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
213         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
214         T70  = _mm_packs_epi32(T60, T61);
215         _mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
216 
217         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
218         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
219         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
220         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
221         T70  = _mm_packs_epi32(T60, T61);
222         _mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
223 
224         T60  = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
225         T61  = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
226         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
227         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
228         T70  = _mm_packs_epi32(T60, T61);
229         _mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
230 
231         T60  = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[5]));
232         T61  = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[5]));
233         T62  = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[5]));
234         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
235         T60  = _mm_hadd_epi32(T60, T61);
236         T61  = _mm_hadd_epi32(T62, T63);
237         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
238         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
239         T70  = _mm_packs_epi32(T60, T61);
240         _mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
241 
242         T60  = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[6]));
243         T61  = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[6]));
244         T62  = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[6]));
245         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
246         T60  = _mm_hadd_epi32(T60, T61);
247         T61  = _mm_hadd_epi32(T62, T63);
248         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
249         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
250         T70  = _mm_packs_epi32(T60, T61);
251         _mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
252 
253         T60  = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[7]));
254         T61  = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[7]));
255         T62  = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[7]));
256         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
257         T60  = _mm_hadd_epi32(T60, T61);
258         T61  = _mm_hadd_epi32(T62, T63);
259         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
260         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
261         T70  = _mm_packs_epi32(T60, T61);
262         _mm_store_si128((__m128i*)&tmp[10 * 16 + i], T70);
263 
264         T60  = _mm_madd_epi16(T44, _mm_load_si128((__m128i*)tab_dct_8[8]));
265         T61  = _mm_madd_epi16(T45, _mm_load_si128((__m128i*)tab_dct_8[8]));
266         T62  = _mm_madd_epi16(T46, _mm_load_si128((__m128i*)tab_dct_8[8]));
267         T63  = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[8]));
268         T60  = _mm_hadd_epi32(T60, T61);
269         T61  = _mm_hadd_epi32(T62, T63);
270         T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
271         T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
272         T70  = _mm_packs_epi32(T60, T61);
273         _mm_store_si128((__m128i*)&tmp[14 * 16 + i], T70);
274 
275 #define MAKE_ODD(tab, dstPos) \
276     T60  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
277     T61  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
278     T62  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
279     T63  = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
280     T64  = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
281     T65  = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
282     T66  = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
283     T67  = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
284     T60  = _mm_hadd_epi32(T60, T61); \
285     T61  = _mm_hadd_epi32(T62, T63); \
286     T62  = _mm_hadd_epi32(T64, T65); \
287     T63  = _mm_hadd_epi32(T66, T67); \
288     T60  = _mm_hadd_epi32(T60, T61); \
289     T61  = _mm_hadd_epi32(T62, T63); \
290     T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1); \
291     T61  = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1); \
292     T70  = _mm_packs_epi32(T60, T61); \
293     _mm_store_si128((__m128i*)&tmp[(dstPos) * 16 + i], T70);
294 
295         MAKE_ODD(0, 1);
296         MAKE_ODD(1, 3);
297         MAKE_ODD(2, 5);
298         MAKE_ODD(3, 7);
299         MAKE_ODD(4, 9);
300         MAKE_ODD(5, 11);
301         MAKE_ODD(6, 13);
302         MAKE_ODD(7, 15);
303 #undef MAKE_ODD
304     }
305 
306     // DCT2
307     for (i = 0; i < 16; i += 4)
308     {
309         T00A = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 0]);    // [07 06 05 04 03 02 01 00]
310         T00B = _mm_load_si128((__m128i*)&tmp[(i + 0) * 16 + 8]);    // [0F 0E 0D 0C 0B 0A 09 08]
311         T01A = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 0]);    // [17 16 15 14 13 12 11 10]
312         T01B = _mm_load_si128((__m128i*)&tmp[(i + 1) * 16 + 8]);    // [1F 1E 1D 1C 1B 1A 19 18]
313         T02A = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 0]);    // [27 26 25 24 23 22 21 20]
314         T02B = _mm_load_si128((__m128i*)&tmp[(i + 2) * 16 + 8]);    // [2F 2E 2D 2C 2B 2A 29 28]
315         T03A = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 0]);    // [37 36 35 34 33 32 31 30]
316         T03B = _mm_load_si128((__m128i*)&tmp[(i + 3) * 16 + 8]);    // [3F 3E 3D 3C 3B 3A 39 38]
317 
318         T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
319         T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
320         T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
321         T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
322         T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
323         T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
324         T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[2]));
325         T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_16_0[3]));
326 
327         T10  = _mm_unpacklo_epi16(T00A, T00B);
328         T11  = _mm_unpackhi_epi16(T00A, T00B);
329         T12  = _mm_unpacklo_epi16(T01A, T01B);
330         T13  = _mm_unpackhi_epi16(T01A, T01B);
331         T14  = _mm_unpacklo_epi16(T02A, T02B);
332         T15  = _mm_unpackhi_epi16(T02A, T02B);
333         T16  = _mm_unpacklo_epi16(T03A, T03B);
334         T17  = _mm_unpackhi_epi16(T03A, T03B);
335 
336         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_8[1]));
337         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_8[1]));
338         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_8[1]));
339         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_8[1]));
340         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_8[1]));
341         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_8[1]));
342         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_8[1]));
343         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_8[1]));
344 
345         T30  = _mm_add_epi32(T20, T21);
346         T31  = _mm_add_epi32(T22, T23);
347         T32  = _mm_add_epi32(T24, T25);
348         T33  = _mm_add_epi32(T26, T27);
349 
350         T30  = _mm_hadd_epi32(T30, T31);
351         T31  = _mm_hadd_epi32(T32, T33);
352 
353         T40  = _mm_hadd_epi32(T30, T31);
354         T41  = _mm_hsub_epi32(T30, T31);
355         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
356         T41  = _mm_srai_epi32(_mm_add_epi32(T41, c_512), DCT16_SHIFT2);
357         T40  = _mm_packs_epi32(T40, T40);
358         T41  = _mm_packs_epi32(T41, T41);
359         _mm_storel_epi64((__m128i*)&dst[0 * 16 + i], T40);
360         _mm_storel_epi64((__m128i*)&dst[8 * 16 + i], T41);
361 
362         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
363         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
364         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
365         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
366         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
367         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
368         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
369         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
370 
371         T30  = _mm_add_epi32(T20, T21);
372         T31  = _mm_add_epi32(T22, T23);
373         T32  = _mm_add_epi32(T24, T25);
374         T33  = _mm_add_epi32(T26, T27);
375 
376         T30  = _mm_hadd_epi32(T30, T31);
377         T31  = _mm_hadd_epi32(T32, T33);
378 
379         T40  = _mm_hadd_epi32(T30, T31);
380         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
381         T40  = _mm_packs_epi32(T40, T40);
382         _mm_storel_epi64((__m128i*)&dst[4 * 16 + i], T40);
383 
384         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
385         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
386         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
387         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
388         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
389         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
390         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
391         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
392 
393         T30  = _mm_add_epi32(T20, T21);
394         T31  = _mm_add_epi32(T22, T23);
395         T32  = _mm_add_epi32(T24, T25);
396         T33  = _mm_add_epi32(T26, T27);
397 
398         T30  = _mm_hadd_epi32(T30, T31);
399         T31  = _mm_hadd_epi32(T32, T33);
400 
401         T40  = _mm_hadd_epi32(T30, T31);
402         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
403         T40  = _mm_packs_epi32(T40, T40);
404         _mm_storel_epi64((__m128i*)&dst[12 * 16 + i], T40);
405 
406         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
407         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
408         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
409         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
410         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
411         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
412         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
413         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[10]));
414 
415         T30  = _mm_sub_epi32(T20, T21);
416         T31  = _mm_sub_epi32(T22, T23);
417         T32  = _mm_sub_epi32(T24, T25);
418         T33  = _mm_sub_epi32(T26, T27);
419 
420         T30  = _mm_hadd_epi32(T30, T31);
421         T31  = _mm_hadd_epi32(T32, T33);
422 
423         T40  = _mm_hadd_epi32(T30, T31);
424         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
425         T40  = _mm_packs_epi32(T40, T40);
426         _mm_storel_epi64((__m128i*)&dst[2 * 16 + i], T40);
427 
428         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
429         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
430         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
431         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
432         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
433         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
434         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
435         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[11]));
436 
437         T30  = _mm_sub_epi32(T20, T21);
438         T31  = _mm_sub_epi32(T22, T23);
439         T32  = _mm_sub_epi32(T24, T25);
440         T33  = _mm_sub_epi32(T26, T27);
441 
442         T30  = _mm_hadd_epi32(T30, T31);
443         T31  = _mm_hadd_epi32(T32, T33);
444 
445         T40  = _mm_hadd_epi32(T30, T31);
446         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
447         T40  = _mm_packs_epi32(T40, T40);
448         _mm_storel_epi64((__m128i*)&dst[6 * 16 + i], T40);
449 
450         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
451         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
452         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
453         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
454         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
455         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
456         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
457         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[12]));
458 
459         T30  = _mm_sub_epi32(T20, T21);
460         T31  = _mm_sub_epi32(T22, T23);
461         T32  = _mm_sub_epi32(T24, T25);
462         T33  = _mm_sub_epi32(T26, T27);
463 
464         T30  = _mm_hadd_epi32(T30, T31);
465         T31  = _mm_hadd_epi32(T32, T33);
466 
467         T40  = _mm_hadd_epi32(T30, T31);
468         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
469         T40  = _mm_packs_epi32(T40, T40);
470         _mm_storel_epi64((__m128i*)&dst[10 * 16 + i], T40);
471 
472         T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
473         T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
474         T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
475         T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
476         T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
477         T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
478         T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
479         T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[13]));
480 
481         T30  = _mm_sub_epi32(T20, T21);
482         T31  = _mm_sub_epi32(T22, T23);
483         T32  = _mm_sub_epi32(T24, T25);
484         T33  = _mm_sub_epi32(T26, T27);
485 
486         T30  = _mm_hadd_epi32(T30, T31);
487         T31  = _mm_hadd_epi32(T32, T33);
488 
489         T40  = _mm_hadd_epi32(T30, T31);
490         T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2);
491         T40  = _mm_packs_epi32(T40, T40);
492         _mm_storel_epi64((__m128i*)&dst[14 * 16 + i], T40);
493 
494 #define MAKE_ODD(tab, dstPos) \
495     T20  = _mm_madd_epi16(T10, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)]));       /* [*O2_0 *O1_0 *O3_0 *O0_0] */ \
496     T21  = _mm_madd_epi16(T11, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1]));   /* [*O5_0 *O6_0 *O4_0 *O7_0] */ \
497     T22  = _mm_madd_epi16(T12, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
498     T23  = _mm_madd_epi16(T13, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
499     T24  = _mm_madd_epi16(T14, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
500     T25  = _mm_madd_epi16(T15, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
501     T26  = _mm_madd_epi16(T16, _mm_load_si128((__m128i*)tab_dct_16_1[(tab)])); \
502     T27  = _mm_madd_epi16(T17, _mm_load_si128((__m128i*)tab_dct_16_1[(tab) + 1])); \
503         \
504     T30  = _mm_add_epi32(T20, T21); \
505     T31  = _mm_add_epi32(T22, T23); \
506     T32  = _mm_add_epi32(T24, T25); \
507     T33  = _mm_add_epi32(T26, T27); \
508         \
509     T30  = _mm_hadd_epi32(T30, T31); \
510     T31  = _mm_hadd_epi32(T32, T33); \
511         \
512     T40  = _mm_hadd_epi32(T30, T31); \
513     T40  = _mm_srai_epi32(_mm_add_epi32(T40, c_512), DCT16_SHIFT2); \
514     T40  = _mm_packs_epi32(T40, T40); \
515     _mm_storel_epi64((__m128i*)&dst[(dstPos) * 16 + i], T40);
516 
517         MAKE_ODD(14,  1);
518         MAKE_ODD(16,  3);
519         MAKE_ODD(18,  5);
520         MAKE_ODD(20,  7);
521         MAKE_ODD(22,  9);
522         MAKE_ODD(24, 11);
523         MAKE_ODD(26, 13);
524         MAKE_ODD(28, 15);
525 #undef MAKE_ODD
526     }
527 }
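/* What the two passes above compute, expressed as a plain scalar reference
 * (illustrative sketch only, not compiled: g_dctCoef16 is a stand-in name for
 * the 16x16 HEVC transform matrix, which is not defined in this file, and the
 * saturating pack of the intermediate values is omitted):
 */
#if 0
static void dct16_scalar_ref(const int16_t *src, int16_t *dst, intptr_t stride)
{
    int16_t tmp[16 * 16];

    // Pass 1: transform each input row, storing the result transposed
    for (int i = 0; i < 16; i++)
        for (int k = 0; k < 16; k++)
        {
            int32_t sum = 0;
            for (int n = 0; n < 16; n++)
                sum += g_dctCoef16[k][n] * src[i * stride + n];
            tmp[k * 16 + i] = (int16_t)((sum + DCT16_ADD1) >> DCT16_SHIFT1);
        }

    // Pass 2: transform each row of the intermediate block the same way
    for (int i = 0; i < 16; i++)
        for (int k = 0; k < 16; k++)
        {
            int32_t sum = 0;
            for (int n = 0; n < 16; n++)
                sum += g_dctCoef16[k][n] * tmp[i * 16 + n];
            dst[k * 16 + i] = (int16_t)((sum + DCT16_ADD2) >> DCT16_SHIFT2);
        }
}
#endif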
528 
529 ALIGN_VAR_32(static const int16_t, tab_dct_32_0[][8]) =
530 {
531     { 0x0F0E, 0x0100, 0x0908, 0x0706, 0x0D0C, 0x0302, 0x0B0A, 0x0504 },  // 0
532 };
533 
534 ALIGN_VAR_32(static const int16_t, tab_dct_32_1[][8]) =
535 {
536     { 89, -89, 18, -18, 75, -75, 50, -50 },          //  0
537     { 75, -75, -50, 50, -18, 18, -89, 89 },          //  1
538     { 50, -50, 75, -75, -89, 89, 18, -18 },          //  2
539     { 18, -18, -89, 89, -50, 50, 75, -75 },          //  3
540 
541 #define MAKE_COEF8(a0, a1, a2, a3, a4, a5, a6, a7) \
542     { (a0), (a7), (a3), (a4), (a1), (a6), (a2), (a5) \
543     }, \
544 
545     MAKE_COEF8(90, 87, 80, 70, 57, 43, 25,  9)   //  4
546     MAKE_COEF8(87, 57,  9, -43, -80, -90, -70, -25)   //  5
547     MAKE_COEF8(80,  9, -70, -87, -25, 57, 90, 43)   //  6
548     MAKE_COEF8(70, -43, -87,  9, 90, 25, -80, -57)   //  7
549     MAKE_COEF8(57, -80, -25, 90, -9, -87, 43, 70)   //  8
550     MAKE_COEF8(43, -90, 57, 25, -87, 70,  9, -80)   //  9
551     MAKE_COEF8(25, -70, 90, -80, 43,  9, -57, 87)   // 10
552     MAKE_COEF8(9, -25, 43, -57, 70, -80, 87, -90)   // 11
553 #undef MAKE_COEF8
554 
555 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
556     { (a00), (a07), (a03), (a04), (a01), (a06), (a02), (a05) }, \
557     { (a15), (a08), (a12), (a11), (a14), (a09), (a13), (a10) },
558 
559     MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13,  4)    // 12
560     MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13)    // 14
561     MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22)    // 16
562     MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31)    // 18
563     MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67,  4, 73, 88, 38)    // 20
564     MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46)    // 22
565     MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54)    // 24
566     MAKE_COEF16(67, -54, -78, 38, 85, -22, -90,  4, 90, 13, -88, -31, 82, 46, -73, -61)    // 26
567     MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67)    // 28
568     MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73)    // 30
569     MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82,  4, 78)    // 32
570     MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82)    // 34
571     MAKE_COEF16(31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85)    // 36
572     MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88)    // 38
573     MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, 54, -31,  4, 22, -46, 67, -82, 90)    // 40
574     MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90)    // 42
575 #undef MAKE_COEF16
576 
577     {
578         64, 64, 64, 64, 64, 64, 64, 64
579     },                                  // 44
580 
581     { 64, 64, -64, -64, -64, -64, 64, 64 },  // 45
582 
583     { 83, 83, 36, 36, -36, -36, -83, -83 },  // 46
584     { -83, -83, -36, -36, 36, 36, 83, 83 },  // 47
585 
586     { 36, 36, -83, -83, 83, 83, -36, -36 },  // 48
587     { -36, -36, 83, 83, -83, -83, 36, 36 },  // 49
588 
589 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
590     { (a00), (a00), (a01), (a01), (a02), (a02), (a03), (a03) }, \
591     { (a04), (a04), (a05), (a05), (a06), (a06), (a07), (a07) }, \
592     { (a08), (a08), (a09), (a09), (a10), (a10), (a11), (a11) }, \
593     { (a12), (a12), (a13), (a13), (a14), (a14), (a15), (a15) },
594 
595     MAKE_COEF16(89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89) // 50
596     MAKE_COEF16(75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75) // 54
597 
598     // TODO: convert the literal coefficient rows below to this MAKE_COEF16 form
599 #undef MAKE_COEF16
600 
601     {
602         50, 50, -89, -89, 18, 18, 75, 75
603     },                                  // 58
604     { -75, -75, -18, -18, 89, 89, -50, -50 },  // 59
605     { -50, -50, 89, 89, -18, -18, -75, -75 },  // 60
606     { 75, 75, 18, 18, -89, -89, 50, 50 },  // 61
607 
608     { 18, 18, -50, -50, 75, 75, -89, -89 },  // 62
609     { 89, 89, -75, -75, 50, 50, -18, -18 },  // 63
610     { -18, -18, 50, 50, -75, -75, 89, 89 },  // 64
611     { -89, -89, 75, 75, -50, -50, 18, 18 },  // 65
612 
613     { 90, 90, 87, 87, 80, 80, 70, 70 },  // 66
614     { 57, 57, 43, 43, 25, 25,  9,  9 },  // 67
615     { -9, -9, -25, -25, -43, -43, -57, -57 },  // 68
616     { -70, -70, -80, -80, -87, -87, -90, -90 },  // 69
617 
618     { 87, 87, 57, 57,  9,  9, -43, -43 },  // 70
619     { -80, -80, -90, -90, -70, -70, -25, -25 },  // 71
620     { 25, 25, 70, 70, 90, 90, 80, 80 },  // 72
621     { 43, 43, -9, -9, -57, -57, -87, -87 },  // 73
622 
623     { 80, 80,  9,  9, -70, -70, -87, -87 },  // 74
624     { -25, -25, 57, 57, 90, 90, 43, 43 },  // 75
625     { -43, -43, -90, -90, -57, -57, 25, 25 },  // 76
626     { 87, 87, 70, 70, -9, -9, -80, -80 },  // 77
627 
628     { 70, 70, -43, -43, -87, -87,  9,  9 },  // 78
629     { 90, 90, 25, 25, -80, -80, -57, -57 },  // 79
630     { 57, 57, 80, 80, -25, -25, -90, -90 },  // 80
631     { -9, -9, 87, 87, 43, 43, -70, -70 },  // 81
632 
633     { 57, 57, -80, -80, -25, -25, 90, 90 },  // 82
634     { -9, -9, -87, -87, 43, 43, 70, 70 },  // 83
635     { -70, -70, -43, -43, 87, 87,  9,  9 },  // 84
636     { -90, -90, 25, 25, 80, 80, -57, -57 },  // 85
637 
638     { 43, 43, -90, -90, 57, 57, 25, 25 },  // 86
639     { -87, -87, 70, 70,  9,  9, -80, -80 },  // 87
640     { 80, 80, -9, -9, -70, -70, 87, 87 },  // 88
641     { -25, -25, -57, -57, 90, 90, -43, -43 },  // 89
642 
643     { 25, 25, -70, -70, 90, 90, -80, -80 },  // 90
644     { 43, 43,  9,  9, -57, -57, 87, 87 },  // 91
645     { -87, -87, 57, 57, -9, -9, -43, -43 },  // 92
646     { 80, 80, -90, -90, 70, 70, -25, -25 },  // 93
647 
648     {  9,  9, -25, -25, 43, 43, -57, -57 },  // 94
649     { 70, 70, -80, -80, 87, 87, -90, -90 },  // 95
650     { 90, 90, -87, -87, 80, 80, -70, -70 },  // 96
651     { 57, 57, -43, -43, 25, 25, -9, -9 },  // 97
652 
653 #define MAKE_COEF16(a00, a01, a02, a03, a04, a05, a06, a07, a08, a09, a10, a11, a12, a13, a14, a15) \
654     { (a00), -(a00), (a01), -(a01), (a02), -(a02), (a03), -(a03) }, \
655     { (a04), -(a04), (a05), -(a05), (a06), -(a06), (a07), -(a07) }, \
656     { (a08), -(a08), (a09), -(a09), (a10), -(a10), (a11), -(a11) }, \
657     { (a12), -(a12), (a13), -(a13), (a14), -(a14), (a15), -(a15) },
658 
659     MAKE_COEF16(90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4)    // 98
660     MAKE_COEF16(90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13)     //102
661     MAKE_COEF16(88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22)     //106
662     MAKE_COEF16(85, 46, -13, -67, -90, -73, -22, 38, +82, 88, 54, -4, -61, -90, -78, -31)     //110
663     MAKE_COEF16(82, 22, -54, -90, -61, 13, 78, 85, +31, -46, -90, -67,  4, 73, 88, 38)     //114
664     MAKE_COEF16(78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46)     //118
665     MAKE_COEF16(73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54)     //122
666     MAKE_COEF16(67, -54, -78, 38, 85, -22, -90,  4, +90, 13, -88, -31, 82, 46, -73, -61)     //126
667     MAKE_COEF16(61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67)     //130
668     MAKE_COEF16(54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73)     //134
669     MAKE_COEF16(46, -90, 38, 54, -90, 31, 61, -88, +22, 67, -85, 13, 73, -82,  4, 78)     //138
670     MAKE_COEF16(38, -88, 73, -4, -67, 90, -46, -31, +85, -78, 13, 61, -90, 54, 22, -82)     //142
671     MAKE_COEF16(31, -78, 90, -61,  4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85)     //146
672     MAKE_COEF16(22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88)     //150
673     MAKE_COEF16(13, -38, 61, -78, 88, -90, 85, -73, +54, -31,  4, 22, -46, 67, -82, 90)     //154
674     MAKE_COEF16(4, -13, 22, -31, 38, -46, 54, -61, +67, -73, 78, -82, 85, -88, 90, -90)     //158
675 
676 #undef MAKE_COEF16
677 };
678 
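/* dct32() below follows the same two-pass layout as dct16(): pass 1 (DCT1)
 * transforms the source rows, splitting each 32-sample row into even/odd
 * halves (the E../O.. and EE../EO.. temporaries) before the coefficient
 * multiplies, and pass 2 (DCT2) transforms the intermediate block held in the
 * local im[][] array of vectors. */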
679 static void dct32(const int16_t *src, int16_t *dst, intptr_t stride)
680 {
681     // Const
682     __m128i c_8     = _mm_set1_epi32(DCT32_ADD1);
683     __m128i c_1024  = _mm_set1_epi32(DCT32_ADD2);
684 
685     int i;
686 
687     __m128i T00A, T01A, T02A, T03A, T04A, T05A, T06A, T07A;
688     __m128i T00B, T01B, T02B, T03B, T04B, T05B, T06B, T07B;
689     __m128i T00C, T01C, T02C, T03C, T04C, T05C, T06C, T07C;
690     __m128i T00D, T01D, T02D, T03D, T04D, T05D, T06D, T07D;
691     __m128i T10A, T11A, T12A, T13A, T14A, T15A, T16A, T17A;
692     __m128i T10B, T11B, T12B, T13B, T14B, T15B, T16B, T17B;
693     __m128i T20, T21, T22, T23, T24, T25, T26, T27;
694     __m128i T30, T31, T32, T33, T34, T35, T36, T37;
695     __m128i T40, T41, T42, T43, T44, T45, T46, T47;
696     __m128i T50, T51, T52, T53;
697     __m128i T60, T61, T62, T63, T64, T65, T66, T67;
698     __m128i im[32][4];
699 
700     // DCT1
701     for (i = 0; i < 32 / 8; i++)
702     {
703         T00A = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 0]);    // [07 06 05 04 03 02 01 00]
704         T00B = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 8]);    // [15 14 13 12 11 10 09 08]
705         T00C = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 16]);    // [23 22 21 20 19 18 17 16]
706         T00D = _mm_load_si128((__m128i*)&src[(i * 8 + 0) * stride + 24]);    // [31 30 29 28 27 26 25 24]
707         T01A = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 0]);
708         T01B = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 8]);
709         T01C = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 16]);
710         T01D = _mm_load_si128((__m128i*)&src[(i * 8 + 1) * stride + 24]);
711         T02A = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 0]);
712         T02B = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 8]);
713         T02C = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 16]);
714         T02D = _mm_load_si128((__m128i*)&src[(i * 8 + 2) * stride + 24]);
715         T03A = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 0]);
716         T03B = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 8]);
717         T03C = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 16]);
718         T03D = _mm_load_si128((__m128i*)&src[(i * 8 + 3) * stride + 24]);
719         T04A = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 0]);
720         T04B = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 8]);
721         T04C = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 16]);
722         T04D = _mm_load_si128((__m128i*)&src[(i * 8 + 4) * stride + 24]);
723         T05A = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 0]);
724         T05B = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 8]);
725         T05C = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 16]);
726         T05D = _mm_load_si128((__m128i*)&src[(i * 8 + 5) * stride + 24]);
727         T06A = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 0]);
728         T06B = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 8]);
729         T06C = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 16]);
730         T06D = _mm_load_si128((__m128i*)&src[(i * 8 + 6) * stride + 24]);
731         T07A = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 0]);
732         T07B = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 8]);
733         T07C = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 16]);
734         T07D = _mm_load_si128((__m128i*)&src[(i * 8 + 7) * stride + 24]);
735 
736         T00A = _mm_shuffle_epi8(T00A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [05 02 06 01 04 03 07 00]
737         T00B = _mm_shuffle_epi8(T00B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [10 13 09 14 11 12 08 15]
738         T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));    // [21 18 22 17 20 19 23 16]
739         T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));    // [26 29 25 30 27 28 24 31]
740         T01A = _mm_shuffle_epi8(T01A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
741         T01B = _mm_shuffle_epi8(T01B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
742         T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
743         T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
744         T02A = _mm_shuffle_epi8(T02A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
745         T02B = _mm_shuffle_epi8(T02B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
746         T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
747         T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
748         T03A = _mm_shuffle_epi8(T03A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
749         T03B = _mm_shuffle_epi8(T03B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
750         T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
751         T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
752         T04A = _mm_shuffle_epi8(T04A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
753         T04B = _mm_shuffle_epi8(T04B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
754         T04C = _mm_shuffle_epi8(T04C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
755         T04D = _mm_shuffle_epi8(T04D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
756         T05A = _mm_shuffle_epi8(T05A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
757         T05B = _mm_shuffle_epi8(T05B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
758         T05C = _mm_shuffle_epi8(T05C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
759         T05D = _mm_shuffle_epi8(T05D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
760         T06A = _mm_shuffle_epi8(T06A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
761         T06B = _mm_shuffle_epi8(T06B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
762         T06C = _mm_shuffle_epi8(T06C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
763         T06D = _mm_shuffle_epi8(T06D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
764         T07A = _mm_shuffle_epi8(T07A, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
765         T07B = _mm_shuffle_epi8(T07B, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
766         T07C = _mm_shuffle_epi8(T07C, _mm_load_si128((__m128i*)tab_dct_16_0[1]));
767         T07D = _mm_shuffle_epi8(T07D, _mm_load_si128((__m128i*)tab_dct_32_0[0]));
768 
769         T10A = _mm_add_epi16(T00A, T00D);   // [E05 E02 E06 E01 E04 E03 E07 E00]
770         T10B = _mm_add_epi16(T00B, T00C);   // [E10 E13 E09 E14 E11 E12 E08 E15]
771         T11A = _mm_add_epi16(T01A, T01D);
772         T11B = _mm_add_epi16(T01B, T01C);
773         T12A = _mm_add_epi16(T02A, T02D);
774         T12B = _mm_add_epi16(T02B, T02C);
775         T13A = _mm_add_epi16(T03A, T03D);
776         T13B = _mm_add_epi16(T03B, T03C);
777         T14A = _mm_add_epi16(T04A, T04D);
778         T14B = _mm_add_epi16(T04B, T04C);
779         T15A = _mm_add_epi16(T05A, T05D);
780         T15B = _mm_add_epi16(T05B, T05C);
781         T16A = _mm_add_epi16(T06A, T06D);
782         T16B = _mm_add_epi16(T06B, T06C);
783         T17A = _mm_add_epi16(T07A, T07D);
784         T17B = _mm_add_epi16(T07B, T07C);
785 
786         T00A = _mm_sub_epi16(T00A, T00D);   // [O05 O02 O06 O01 O04 O03 O07 O00]
787         T00B = _mm_sub_epi16(T00B, T00C);   // [O10 O13 O09 O14 O11 O12 O08 O15]
788         T01A = _mm_sub_epi16(T01A, T01D);
789         T01B = _mm_sub_epi16(T01B, T01C);
790         T02A = _mm_sub_epi16(T02A, T02D);
791         T02B = _mm_sub_epi16(T02B, T02C);
792         T03A = _mm_sub_epi16(T03A, T03D);
793         T03B = _mm_sub_epi16(T03B, T03C);
794         T04A = _mm_sub_epi16(T04A, T04D);
795         T04B = _mm_sub_epi16(T04B, T04C);
796         T05A = _mm_sub_epi16(T05A, T05D);
797         T05B = _mm_sub_epi16(T05B, T05C);
798         T06A = _mm_sub_epi16(T06A, T06D);
799         T06B = _mm_sub_epi16(T06B, T06C);
800         T07A = _mm_sub_epi16(T07A, T07D);
801         T07B = _mm_sub_epi16(T07B, T07C);
802 
803         T20  = _mm_add_epi16(T10A, T10B);   // [EE5 EE2 EE6 EE1 EE4 EE3 EE7 EE0]
804         T21  = _mm_add_epi16(T11A, T11B);
805         T22  = _mm_add_epi16(T12A, T12B);
806         T23  = _mm_add_epi16(T13A, T13B);
807         T24  = _mm_add_epi16(T14A, T14B);
808         T25  = _mm_add_epi16(T15A, T15B);
809         T26  = _mm_add_epi16(T16A, T16B);
810         T27  = _mm_add_epi16(T17A, T17B);
811 
812         T30  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_8[1]));
813         T31  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_8[1]));
814         T32  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_8[1]));
815         T33  = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_8[1]));
816         T34  = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_8[1]));
817         T35  = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_8[1]));
818         T36  = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_8[1]));
819         T37  = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_8[1]));
820 
821         T40  = _mm_hadd_epi32(T30, T31);
822         T41  = _mm_hadd_epi32(T32, T33);
823         T42  = _mm_hadd_epi32(T34, T35);
824         T43  = _mm_hadd_epi32(T36, T37);
825 
826         T50  = _mm_hadd_epi32(T40, T41);
827         T51  = _mm_hadd_epi32(T42, T43);
828         T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
829         T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
830         T60  = _mm_packs_epi32(T50, T51);
831         im[0][i] = T60;
832 
833         T50  = _mm_hsub_epi32(T40, T41);
834         T51  = _mm_hsub_epi32(T42, T43);
835         T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
836         T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
837         T60  = _mm_packs_epi32(T50, T51);
838         im[16][i] = T60;
839 
840         T30  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
841         T31  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
842         T32  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
843         T33  = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
844         T34  = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
845         T35  = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
846         T36  = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
847         T37  = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[8]));
848 
849         T40  = _mm_hadd_epi32(T30, T31);
850         T41  = _mm_hadd_epi32(T32, T33);
851         T42  = _mm_hadd_epi32(T34, T35);
852         T43  = _mm_hadd_epi32(T36, T37);
853 
854         T50  = _mm_hadd_epi32(T40, T41);
855         T51  = _mm_hadd_epi32(T42, T43);
856         T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
857         T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
858         T60  = _mm_packs_epi32(T50, T51);
859         im[8][i] = T60;
860 
861         T30  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
862         T31  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
863         T32  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
864         T33  = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
865         T34  = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
866         T35  = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
867         T36  = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
868         T37  = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_16_1[9]));
869 
870         T40  = _mm_hadd_epi32(T30, T31);
871         T41  = _mm_hadd_epi32(T32, T33);
872         T42  = _mm_hadd_epi32(T34, T35);
873         T43  = _mm_hadd_epi32(T36, T37);
874 
875         T50  = _mm_hadd_epi32(T40, T41);
876         T51  = _mm_hadd_epi32(T42, T43);
877         T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1);
878         T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1);
879         T60  = _mm_packs_epi32(T50, T51);
880         im[24][i] = T60;
881 
882 #define MAKE_ODD(tab, dstPos) \
883     T30  = _mm_madd_epi16(T20, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
884     T31  = _mm_madd_epi16(T21, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
885     T32  = _mm_madd_epi16(T22, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
886     T33  = _mm_madd_epi16(T23, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
887     T34  = _mm_madd_epi16(T24, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
888     T35  = _mm_madd_epi16(T25, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
889     T36  = _mm_madd_epi16(T26, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
890     T37  = _mm_madd_epi16(T27, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
891         \
892     T40  = _mm_hadd_epi32(T30, T31); \
893     T41  = _mm_hadd_epi32(T32, T33); \
894     T42  = _mm_hadd_epi32(T34, T35); \
895     T43  = _mm_hadd_epi32(T36, T37); \
896         \
897     T50  = _mm_hadd_epi32(T40, T41); \
898     T51  = _mm_hadd_epi32(T42, T43); \
899     T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
900     T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
901     T60  = _mm_packs_epi32(T50, T51); \
902     im[(dstPos)][i] = T60;
903 
904         MAKE_ODD(0, 4);
905         MAKE_ODD(1, 12);
906         MAKE_ODD(2, 20);
907         MAKE_ODD(3, 28);
908 
909         T20  = _mm_sub_epi16(T10A, T10B);   // [EO5 EO2 EO6 EO1 EO4 EO3 EO7 EO0]
910         T21  = _mm_sub_epi16(T11A, T11B);
911         T22  = _mm_sub_epi16(T12A, T12B);
912         T23  = _mm_sub_epi16(T13A, T13B);
913         T24  = _mm_sub_epi16(T14A, T14B);
914         T25  = _mm_sub_epi16(T15A, T15B);
915         T26  = _mm_sub_epi16(T16A, T16B);
916         T27  = _mm_sub_epi16(T17A, T17B);
917 
918         MAKE_ODD(4, 2);
919         MAKE_ODD(5, 6);
920         MAKE_ODD(6, 10);
921         MAKE_ODD(7, 14);
922         MAKE_ODD(8, 18);
923         MAKE_ODD(9, 22);
924         MAKE_ODD(10, 26);
925         MAKE_ODD(11, 30);
926 #undef MAKE_ODD
927 
928 #define MAKE_ODD(tab, dstPos) \
929     T20  = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
930     T21  = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
931     T22  = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
932     T23  = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
933     T24  = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
934     T25  = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
935     T26  = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
936     T27  = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
937     T30  = _mm_madd_epi16(T04A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
938     T31  = _mm_madd_epi16(T04B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
939     T32  = _mm_madd_epi16(T05A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
940     T33  = _mm_madd_epi16(T05B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
941     T34  = _mm_madd_epi16(T06A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
942     T35  = _mm_madd_epi16(T06B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
943     T36  = _mm_madd_epi16(T07A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab)])); \
944     T37  = _mm_madd_epi16(T07B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab) + 1])); \
945         \
946     T40  = _mm_hadd_epi32(T20, T21); \
947     T41  = _mm_hadd_epi32(T22, T23); \
948     T42  = _mm_hadd_epi32(T24, T25); \
949     T43  = _mm_hadd_epi32(T26, T27); \
950     T44  = _mm_hadd_epi32(T30, T31); \
951     T45  = _mm_hadd_epi32(T32, T33); \
952     T46  = _mm_hadd_epi32(T34, T35); \
953     T47  = _mm_hadd_epi32(T36, T37); \
954         \
955     T50  = _mm_hadd_epi32(T40, T41); \
956     T51  = _mm_hadd_epi32(T42, T43); \
957     T52  = _mm_hadd_epi32(T44, T45); \
958     T53  = _mm_hadd_epi32(T46, T47); \
959         \
960     T50  = _mm_hadd_epi32(T50, T51); \
961     T51  = _mm_hadd_epi32(T52, T53); \
962     T50  = _mm_srai_epi32(_mm_add_epi32(T50, c_8), DCT32_SHIFT1); \
963     T51  = _mm_srai_epi32(_mm_add_epi32(T51, c_8), DCT32_SHIFT1); \
964     T60  = _mm_packs_epi32(T50, T51); \
965     im[(dstPos)][i] = T60;
966 
967         MAKE_ODD(12,  1);
968         MAKE_ODD(14,  3);
969         MAKE_ODD(16,  5);
970         MAKE_ODD(18,  7);
971         MAKE_ODD(20,  9);
972         MAKE_ODD(22, 11);
973         MAKE_ODD(24, 13);
974         MAKE_ODD(26, 15);
975         MAKE_ODD(28, 17);
976         MAKE_ODD(30, 19);
977         MAKE_ODD(32, 21);
978         MAKE_ODD(34, 23);
979         MAKE_ODD(36, 25);
980         MAKE_ODD(38, 27);
981         MAKE_ODD(40, 29);
982         MAKE_ODD(42, 31);
983 
984 #undef MAKE_ODD
985     }
986 
987     // DCT2
988     for (i = 0; i < 32 / 4; i++)
989     {
990         // OPT_ME: to avoid register spill, this pass uses a full matrix multiply; is there a better way?
991         T00A = im[i * 4 + 0][0];    // [07 06 05 04 03 02 01 00]
992         T00B = im[i * 4 + 0][1];    // [15 14 13 12 11 10 09 08]
993         T00C = im[i * 4 + 0][2];    // [23 22 21 20 19 18 17 16]
994         T00D = im[i * 4 + 0][3];    // [31 30 29 28 27 26 25 24]
995         T01A = im[i * 4 + 1][0];
996         T01B = im[i * 4 + 1][1];
997         T01C = im[i * 4 + 1][2];
998         T01D = im[i * 4 + 1][3];
999         T02A = im[i * 4 + 2][0];
1000         T02B = im[i * 4 + 2][1];
1001         T02C = im[i * 4 + 2][2];
1002         T02D = im[i * 4 + 2][3];
1003         T03A = im[i * 4 + 3][0];
1004         T03B = im[i * 4 + 3][1];
1005         T03C = im[i * 4 + 3][2];
1006         T03D = im[i * 4 + 3][3];
1007 
1008         T00C = _mm_shuffle_epi8(T00C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [16 17 18 19 20 21 22 23]
1009         T00D = _mm_shuffle_epi8(T00D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));    // [24 25 26 27 28 29 30 31]
1010         T01C = _mm_shuffle_epi8(T01C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1011         T01D = _mm_shuffle_epi8(T01D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1012         T02C = _mm_shuffle_epi8(T02C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1013         T02D = _mm_shuffle_epi8(T02D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1014         T03C = _mm_shuffle_epi8(T03C, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1015         T03D = _mm_shuffle_epi8(T03D, _mm_load_si128((__m128i*)tab_dct_16_0[0]));
1016 
1017         T10A = _mm_unpacklo_epi16(T00A, T00D);  // [28 03 29 02 30 01 31 00]
1018         T10B = _mm_unpackhi_epi16(T00A, T00D);  // [24 07 25 06 26 05 27 04]
1019         T00A = _mm_unpacklo_epi16(T00B, T00C);  // [20 11 21 10 22 09 23 08]
1020         T00B = _mm_unpackhi_epi16(T00B, T00C);  // [16 15 17 14 18 13 19 12]
1021         T11A = _mm_unpacklo_epi16(T01A, T01D);
1022         T11B = _mm_unpackhi_epi16(T01A, T01D);
1023         T01A = _mm_unpacklo_epi16(T01B, T01C);
1024         T01B = _mm_unpackhi_epi16(T01B, T01C);
1025         T12A = _mm_unpacklo_epi16(T02A, T02D);
1026         T12B = _mm_unpackhi_epi16(T02A, T02D);
1027         T02A = _mm_unpacklo_epi16(T02B, T02C);
1028         T02B = _mm_unpackhi_epi16(T02B, T02C);
1029         T13A = _mm_unpacklo_epi16(T03A, T03D);
1030         T13B = _mm_unpackhi_epi16(T03A, T03D);
1031         T03A = _mm_unpacklo_epi16(T03B, T03C);
1032         T03B = _mm_unpackhi_epi16(T03B, T03C);
1033 
1034 #define MAKE_ODD(tab0, tab1, tab2, tab3, dstPos) \
1035     T20  = _mm_madd_epi16(T10A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1036     T21  = _mm_madd_epi16(T10B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1037     T22  = _mm_madd_epi16(T00A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1038     T23  = _mm_madd_epi16(T00B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1039     T24  = _mm_madd_epi16(T11A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1040     T25  = _mm_madd_epi16(T11B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1041     T26  = _mm_madd_epi16(T01A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1042     T27  = _mm_madd_epi16(T01B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1043     T30  = _mm_madd_epi16(T12A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1044     T31  = _mm_madd_epi16(T12B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1045     T32  = _mm_madd_epi16(T02A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1046     T33  = _mm_madd_epi16(T02B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1047     T34  = _mm_madd_epi16(T13A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab0)])); \
1048     T35  = _mm_madd_epi16(T13B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab1)])); \
1049     T36  = _mm_madd_epi16(T03A, _mm_load_si128((__m128i*)tab_dct_32_1[(tab2)])); \
1050     T37  = _mm_madd_epi16(T03B, _mm_load_si128((__m128i*)tab_dct_32_1[(tab3)])); \
1051         \
1052     T60  = _mm_hadd_epi32(T20, T21); \
1053     T61  = _mm_hadd_epi32(T22, T23); \
1054     T62  = _mm_hadd_epi32(T24, T25); \
1055     T63  = _mm_hadd_epi32(T26, T27); \
1056     T64  = _mm_hadd_epi32(T30, T31); \
1057     T65  = _mm_hadd_epi32(T32, T33); \
1058     T66  = _mm_hadd_epi32(T34, T35); \
1059     T67  = _mm_hadd_epi32(T36, T37); \
1060         \
1061     T60  = _mm_hadd_epi32(T60, T61); \
1062     T61  = _mm_hadd_epi32(T62, T63); \
1063     T62  = _mm_hadd_epi32(T64, T65); \
1064     T63  = _mm_hadd_epi32(T66, T67); \
1065         \
1066     T60  = _mm_hadd_epi32(T60, T61); \
1067     T61  = _mm_hadd_epi32(T62, T63); \
1068         \
1069     T60  = _mm_hadd_epi32(T60, T61); \
1070         \
1071     T60  = _mm_srai_epi32(_mm_add_epi32(T60, c_1024), DCT32_SHIFT2); \
1072     T60  = _mm_packs_epi32(T60, T60); \
1073     _mm_storel_epi64((__m128i*)&dst[(dstPos) * 32 + (i * 4) + 0], T60); \
1074 
1075         MAKE_ODD(44, 44, 44, 44,  0);
1076         MAKE_ODD(45, 45, 45, 45, 16);
1077         MAKE_ODD(46, 47, 46, 47,  8);
1078         MAKE_ODD(48, 49, 48, 49, 24);
1079 
1080         MAKE_ODD(50, 51, 52, 53,  4);
1081         MAKE_ODD(54, 55, 56, 57, 12);
1082         MAKE_ODD(58, 59, 60, 61, 20);
1083         MAKE_ODD(62, 63, 64, 65, 28);
1084 
1085         MAKE_ODD(66, 67, 68, 69,  2);
1086         MAKE_ODD(70, 71, 72, 73,  6);
1087         MAKE_ODD(74, 75, 76, 77, 10);
1088         MAKE_ODD(78, 79, 80, 81, 14);
1089 
1090         MAKE_ODD(82, 83, 84, 85, 18);
1091         MAKE_ODD(86, 87, 88, 89, 22);
1092         MAKE_ODD(90, 91, 92, 93, 26);
1093         MAKE_ODD(94, 95, 96, 97, 30);
1094 
1095         MAKE_ODD(98, 99, 100, 101,  1);
1096         MAKE_ODD(102, 103, 104, 105,  3);
1097         MAKE_ODD(106, 107, 108, 109,  5);
1098         MAKE_ODD(110, 111, 112, 113,  7);
1099         MAKE_ODD(114, 115, 116, 117,  9);
1100         MAKE_ODD(118, 119, 120, 121, 11);
1101         MAKE_ODD(122, 123, 124, 125, 13);
1102         MAKE_ODD(126, 127, 128, 129, 15);
1103         MAKE_ODD(130, 131, 132, 133, 17);
1104         MAKE_ODD(134, 135, 136, 137, 19);
1105         MAKE_ODD(138, 139, 140, 141, 21);
1106         MAKE_ODD(142, 143, 144, 145, 23);
1107         MAKE_ODD(146, 147, 148, 149, 25);
1108         MAKE_ODD(150, 151, 152, 153, 27);
1109         MAKE_ODD(154, 155, 156, 157, 29);
1110         MAKE_ODD(158, 159, 160, 161, 31);
1111 #undef MAKE_ODD
1112     }
1113 }
1114 
1115 namespace X265_NS {
1116 void setupIntrinsicDCT_ssse3(EncoderPrimitives &p)
1117 {
1118     /* Note: We have AVX2 assembly for these two functions, but since AVX2
1119      * remains somewhat rare on end-user PCs we still compile and link these
1120      * SSSE3 intrinsic SIMD functions */
1121     p.cu[BLOCK_16x16].dct = dct16;
1122     p.cu[BLOCK_32x32].dct = dct32;
1123 }
1124 }
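/* Illustrative only: a hypothetical caller sketch showing how this entry point
 * would typically be wired into the primitive tables.  The names setupDCTForCpu
 * and cpuMask are assumptions for illustration; the real dispatch lives in
 * x265's primitives setup code, and X265_CPU_SSSE3 is assumed to be the SSSE3
 * cpu-flag bit from x265.h.
 */
#if 0
void setupDCTForCpu(EncoderPrimitives &p, uint32_t cpuMask)
{
    if (cpuMask & X265_CPU_SSSE3)
        X265_NS::setupIntrinsicDCT_ssse3(p);  // install the SSSE3 dct16/dct32 above
}
#endif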
1125