1 /*
2 * Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 
7 #include "EbTransforms_SSE2.h"
8 #include "EbIntrinMacros16bit_SSE2.h"
9 #include <emmintrin.h>
10 
11 /*****************************
12 * Defines
13 *****************************/
14 
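// Two-term multiply-accumulate: loads two constant vectors from EbHevcTransformAsmConst
// (16-bit element offsets OFFSET1/OFFSET2), multiply-adds them against XMM_1/XMM_2,
// adds the rounding offset, arithmetic-shifts right by SHIFT, and packs the result to
// 16 bits in XMM_3. MACRO_TRANS_2MAC additionally stores the low 64 bits to
// transformCoefficients + OFFSET3.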
15 #define MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
16     XMM_3 = _mm_load_si128((__m128i *)(EbHevcTransformAsmConst + OFFSET1));\
17     XMM_4 = _mm_load_si128((__m128i *)(EbHevcTransformAsmConst + OFFSET2));\
18     XMM_3 = _mm_madd_epi16(XMM_3, XMM_1);\
19     XMM_4 = _mm_madd_epi16(XMM_4, XMM_2);\
20     XMM_3 = _mm_srai_epi32(_mm_add_epi32(XMM_4, _mm_add_epi32(XMM_3, XMM_OFFSET)), SHIFT);\
21     XMM_3 = _mm_packs_epi32(XMM_3, XMM_3);
22 
23 #define MACRO_TRANS_2MAC(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT, OFFSET3)\
24     MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
25     _mm_storel_epi64((__m128i *)(transformCoefficients+OFFSET3), XMM_3);
26 
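// Named offsets (in EB_S16 elements) into the external EbHevcTransformAsmConst table;
// each coefficient pair below occupies eight 16-bit entries.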
27 #define TRANS8x8_OFFSET_83_36    0
28 #define TRANS8x8_OFFSET_36_N83  (8 + TRANS8x8_OFFSET_83_36)
29 #define TRANS8x8_OFFSET_89_75   (8 + TRANS8x8_OFFSET_36_N83)
30 #define TRANS8x8_OFFSET_50_18   (8 + TRANS8x8_OFFSET_89_75)
31 #define TRANS8x8_OFFSET_75_N18  (8 + TRANS8x8_OFFSET_50_18)
32 #define TRANS8x8_OFFSET_N89_N50 (8 + TRANS8x8_OFFSET_75_N18)
33 #define TRANS8x8_OFFSET_50_N89  (8 + TRANS8x8_OFFSET_N89_N50)
34 #define TRANS8x8_OFFSET_18_75   (8 + TRANS8x8_OFFSET_50_N89)
35 #define TRANS8x8_OFFSET_18_N50  (8 + TRANS8x8_OFFSET_18_75)
36 #define TRANS8x8_OFFSET_75_N89  (8 + TRANS8x8_OFFSET_18_N50)
37 #define TRANS8x8_OFFSET_256     (8 + TRANS8x8_OFFSET_75_N89)
38 #define TRANS8x8_OFFSET_64_64   (8 + TRANS8x8_OFFSET_256)
39 #define TRANS8x8_OFFSET_N18_N50 (8 + TRANS8x8_OFFSET_64_64)
40 #define TRANS8x8_OFFSET_N75_N89 (8 + TRANS8x8_OFFSET_N18_N50)
41 #define TRANS8x8_OFFSET_N36_N83 (8 + TRANS8x8_OFFSET_N75_N89)
42 #define TRANS8x8_OFFSET_N83_N36 (8 + TRANS8x8_OFFSET_N36_N83)
43 #define TRANS8x8_OFFSET_36_83   (8 + TRANS8x8_OFFSET_N83_N36)
44 #define TRANS8x8_OFFSET_50_89   (8 + TRANS8x8_OFFSET_36_83)
45 #define TRANS8x8_OFFSET_18_N75  (8 + TRANS8x8_OFFSET_50_89)
46 #define TRANS8x8_OFFSET_N64_64  (8 + TRANS8x8_OFFSET_18_N75)
47 #define TRANS8x8_OFFSET_64_N64  (8 + TRANS8x8_OFFSET_N64_64)
48 #define TRANS8x8_OFFSET_N75_N18 (8 + TRANS8x8_OFFSET_64_N64)
49 #define TRANS8x8_OFFSET_89_N50  (8 + TRANS8x8_OFFSET_N75_N18)
50 #define TRANS8x8_OFFSET_83_N36  (8 + TRANS8x8_OFFSET_89_N50)
51 #define TRANS8x8_OFFSET_N36_83  (8 + TRANS8x8_OFFSET_83_N36)
52 #define TRANS8x8_OFFSET_N83_36  (8 + TRANS8x8_OFFSET_N36_83)
53 #define TRANS8x8_OFFSET_89_N75  (8 + TRANS8x8_OFFSET_N83_36)
54 #define TRANS8x8_OFFSET_50_N18  (8 + TRANS8x8_OFFSET_89_N75)
55 
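// Butterfly stage: the even terms are the sums of mirrored rows, the odd terms the differences.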
56 #define MACRO_CALC_EVEN_ODD(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8)\
57     even0 = _mm_add_epi16(XMM1, XMM8);\
58     even1 = _mm_add_epi16(XMM2, XMM7);\
59     even2 = _mm_add_epi16(XMM3, XMM6);\
60     even3 = _mm_add_epi16(XMM4, XMM5);\
61     odd0 = _mm_sub_epi16(XMM1, XMM8);\
62     odd1 = _mm_sub_epi16(XMM2, XMM7);\
63     odd2 = _mm_sub_epi16(XMM3, XMM6);\
64     odd3 = _mm_sub_epi16(XMM4, XMM5);
65 
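// Four multiply-accumulates against the constants at OFFSET1/OFFSET2 in MEM; the two
// rounded, shifted sums are packed into eight 16-bit results in XMM_RET (no store).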
66 #define MACRO_TRANS_4MAC_NO_SAVE(XMM1, XMM2, XMM3, XMM4, XMM_RET, XMM_OFFSET, MEM, OFFSET1, OFFSET2, SHIFT)\
67     XMM_RET = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM1, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
68                                                                          _mm_madd_epi16(XMM3, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT),\
69                               _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM2, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
70                                                                          _mm_madd_epi16(XMM4, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT));
71 
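// Eight multiply-accumulates: XMM1..XMM4 and XMM5..XMM8 are each dotted against the four
// constant vectors at OFST1..OFST4 in MEM, rounded and shifted by SHIFT, then packed into
// eight 16-bit results written to DST + OFST5 with the store intrinsic passed as INSTR.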
72 #define MACRO_TRANS_8MAC(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
73     sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
74     sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
75     sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
76     sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
77     sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
78     sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);\
79     sum = _mm_packs_epi32(sum1, sum3);\
80     INSTR((__m128i *)(DST + OFST5), sum);
81 
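// Partial-frequency (PF) variants: the second half of the computation is commented out and
// sum1 is packed with itself, so only the first four outputs are meaningful. As written,
// the N2 and N4 versions are identical.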
82 #define MACRO_TRANS_8MAC_PF_N2(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
83     sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
84     sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
85     sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
86     /*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
87     /*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
88     /*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
89     /*sum = _mm_packs_epi32(sum1, sum3);*/\
    sum = _mm_packs_epi32(sum1, sum1);\
91     INSTR((__m128i *)(DST + OFST5), sum);
92 #define MACRO_TRANS_8MAC_PF_N4(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
93     sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
94     sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
95     sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
96     /*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
97     /*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
98     /*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
99     /*sum = _mm_packs_epi32(sum1, sum3);*/\
    sum = _mm_packs_epi32(sum1, sum1);\
101     INSTR((__m128i *)(DST + OFST5), sum);
102 
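// Forward 4x4 DST constants (HEVC coefficients 29, 55, 74, 84), interleaved in pairs
// for _mm_madd_epi16.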
103 #ifdef __GNUC__
104 #ifndef __cplusplus
105 __attribute__((visibility("hidden")))
106 #endif
107 #endif
108 EB_ALIGN(16) const EB_S16 DstTransformAsmConst_SSE2[] = {
109     1, 0, 1, 0, 1, 0, 1, 0,
110     29, 55, 29, 55, 29, 55, 29, 55,
111     74, 84, 74, 84, 74, 84, 74, 84,
112     84, -29, 84, -29, 84, -29, 84, -29,
113     -74, 55, -74, 55, -74, 55, -74, 55,
114     55, -84, 55, -84, 55, -84, 55, -84,
115     74, -29, 74, -29, 74, -29, 74, -29,
116     37, 37, 37, 37, 37, 37, 37, 37,
117     74, 74, 74, 74, 74, 74, 74, 74,
118     0, -37, 0, -37, 0, -37, 0, -37,
119     0, -74, 0, -74, 0, -74, 0, -74,
120     //74,    0,   74,    0,   74,    0,   74,    0,
121     //55,  -29,   55,  -29,   55,  -29,   55,  -29,
122 };
123 
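// Constants for the inverse transforms: rounding/shift values followed by HEVC inverse
// DCT coefficient pairs, interleaved for _mm_madd_epi16.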
124 #ifdef __GNUC__
125 #ifndef __cplusplus
126 __attribute__((visibility("hidden")))
127 #endif
128 #endif
129 EB_ALIGN(16) const EB_S16 InvTransformAsmConst_SSE2[] = {
130     2, 0, 2, 0, 2, 0, 2, 0,
131     4, 0, 4, 0, 4, 0, 4, 0,
132     8, 0, 8, 0, 8, 0, 8, 0,
133     9, 0, 9, 0, 9, 0, 9, 0,
134     64, 0, 64, 0, 64, 0, 64, 0,
135     256, 0, 256, 0, 256, 0, 256, 0,
136     512, 0, 512, 0, 512, 0, 512, 0,
137     1024, 0, 1024, 0, 1024, 0, 1024, 0,
138     2048, 0, 2048, 0, 2048, 0, 2048, 0,
139     7, 0, 0, 0, 0, 0, 0, 0,
140     12, 0, 0, 0, 0, 0, 0, 0,
141     64, 64, 64, 64, 64, 64, 64, 64,
142     90, 57, 90, 57, 90, 57, 90, 57,
143     89, 50, 89, 50, 89, 50, 89, 50,
144     87, 43, 87, 43, 87, 43, 87, 43,
145     83, 36, 83, 36, 83, 36, 83, 36,
146     80, 25, 80, 25, 80, 25, 80, 25,
147     75, 18, 75, 18, 75, 18, 75, 18,
148     70, 9, 70, 9, 70, 9, 70, 9,
149     64, -64, 64, -64, 64, -64, 64, -64,
150     87, -80, 87, -80, 87, -80, 87, -80,
151     75, -89, 75, -89, 75, -89, 75, -89,
152     57, -90, 57, -90, 57, -90, 57, -90,
153     36, -83, 36, -83, 36, -83, 36, -83,
154     9, -70, 9, -70, 9, -70, 9, -70,
155     -18, -50, -18, -50, -18, -50, -18, -50,
156     -43, -25, -43, -25, -43, -25, -43, -25,
157     80, -25, 80, -25, 80, -25, 80, -25,
158     50, 18, 50, 18, 50, 18, 50, 18,
159     9, 57, 9, 57, 9, 57, 9, 57,
160     -36, 83, -36, 83, -36, 83, -36, 83,
161     -70, 90, -70, 90, -70, 90, -70, 90,
162     -89, 75, -89, 75, -89, 75, -89, 75,
163     -87, 43, -87, 43, -87, 43, -87, 43,
164     70, 90, 70, 90, 70, 90, 70, 90,
165     18, 75, 18, 75, 18, 75, 18, 75,
166     -43, 25, -43, 25, -43, 25, -43, 25,
167     -83, -36, -83, -36, -83, -36, -83, -36,
168     -87, -80, -87, -80, -87, -80, -87, -80,
169     -50, -89, -50, -89, -50, -89, -50, -89,
170     9, -57, 9, -57, 9, -57, 9, -57,
171     57, -9, 57, -9, 57, -9, 57, -9,
172     -18, -75, -18, -75, -18, -75, -18, -75,
173     -80, -87, -80, -87, -80, -87, -80, -87,
174     -25, 43, -25, 43, -25, 43, -25, 43,
175     50, 89, 50, 89, 50, 89, 50, 89,
176     90, 70, 90, 70, 90, 70, 90, 70,
177     43, -87, 43, -87, 43, -87, 43, -87,
178     -50, -18, -50, -18, -50, -18, -50, -18,
179     -90, 70, -90, 70, -90, 70, -90, 70,
180     57, 9, 57, 9, 57, 9, 57, 9,
181     89, -75, 89, -75, 89, -75, 89, -75,
182     25, -80, 25, -80, 25, -80, 25, -80,
183     25, 43, 25, 43, 25, 43, 25, 43,
184     -75, 89, -75, 89, -75, 89, -75, 89,
185     -70, 9, -70, 9, -70, 9, -70, 9,
186     90, -57, 90, -57, 90, -57, 90, -57,
187     18, 50, 18, 50, 18, 50, 18, 50,
188     -80, 87, -80, 87, -80, 87, -80, 87,
189     9, 70, 9, 70, 9, 70, 9, 70,
190     -89, -50, -89, -50, -89, -50, -89, -50,
191     -25, -80, -25, -80, -25, -80, -25, -80,
192     43, 87, 43, 87, 43, 87, 43, 87,
193     -75, -18, -75, -18, -75, -18, -75, -18,
194     -57, -90, -57, -90, -57, -90, -57, -90,
195     -9, -70, -9, -70, -9, -70, -9, -70,
196     25, 80, 25, 80, 25, 80, 25, 80,
197     -43, -87, -43, -87, -43, -87, -43, -87,
198     57, 90, 57, 90, 57, 90, 57, 90,
199     -25, -43, -25, -43, -25, -43, -25, -43,
200     70, -9, 70, -9, 70, -9, 70, -9,
201     -90, 57, -90, 57, -90, 57, -90, 57,
202     80, -87, 80, -87, 80, -87, 80, -87,
203     -43, 87, -43, 87, -43, 87, -43, 87,
204     90, -70, 90, -70, 90, -70, 90, -70,
205     -57, -9, -57, -9, -57, -9, -57, -9,
206     -25, 80, -25, 80, -25, 80, -25, 80,
207     -57, 9, -57, 9, -57, 9, -57, 9,
208     80, 87, 80, 87, 80, 87, 80, 87,
209     25, -43, 25, -43, 25, -43, 25, -43,
210     -90, -70, -90, -70, -90, -70, -90, -70,
211     -70, -90, -70, -90, -70, -90, -70, -90,
212     43, -25, 43, -25, 43, -25, 43, -25,
213     87, 80, 87, 80, 87, 80, 87, 80,
214     -9, 57, -9, 57, -9, 57, -9, 57,
215     -80, 25, -80, 25, -80, 25, -80, 25,
216     -9, -57, -9, -57, -9, -57, -9, -57,
217     70, -90, 70, -90, 70, -90, 70, -90,
218     87, -43, 87, -43, 87, -43, 87, -43,
219     -87, 80, -87, 80, -87, 80, -87, 80,
220     -57, 90, -57, 90, -57, 90, -57, 90,
221     -9, 70, -9, 70, -9, 70, -9, 70,
222     43, 25, 43, 25, 43, 25, 43, 25,
223     -90, -57, -90, -57, -90, -57, -90, -57,
224     -87, -43, -87, -43, -87, -43, -87, -43,
225     -80, -25, -80, -25, -80, -25, -80, -25,
226     -70, -9, -70, -9, -70, -9, -70, -9,
227     90, 61, 90, 61, 90, 61, 90, 61,
228     90, 54, 90, 54, 90, 54, 90, 54,
229     88, 46, 88, 46, 88, 46, 88, 46,
230     85, 38, 85, 38, 85, 38, 85, 38,
231     82, 31, 82, 31, 82, 31, 82, 31,
232     78, 22, 78, 22, 78, 22, 78, 22,
233     73, 13, 73, 13, 73, 13, 73, 13,
234     67, 4, 67, 4, 67, 4, 67, 4,
235     90, -73, 90, -73, 90, -73, 90, -73,
236     82, -85, 82, -85, 82, -85, 82, -85,
237     67, -90, 67, -90, 67, -90, 67, -90,
238     46, -88, 46, -88, 46, -88, 46, -88,
239     22, -78, 22, -78, 22, -78, 22, -78,
240     -4, -61, -4, -61, -4, -61, -4, -61,
241     -31, -38, -31, -38, -31, -38, -31, -38,
242     -54, -13, -54, -13, -54, -13, -54, -13,
243     88, -46, 88, -46, 88, -46, 88, -46,
244     67, -4, 67, -4, 67, -4, 67, -4,
245     31, 38, 31, 38, 31, 38, 31, 38,
246     -13, 73, -13, 73, -13, 73, -13, 73,
247     -54, 90, -54, 90, -54, 90, -54, 90,
248     -82, 85, -82, 85, -82, 85, -82, 85,
249     -90, 61, -90, 61, -90, 61, -90, 61,
250     -78, 22, -78, 22, -78, 22, -78, 22,
251     85, 82, 85, 82, 85, 82, 85, 82,
252     46, 88, 46, 88, 46, 88, 46, 88,
253     -13, 54, -13, 54, -13, 54, -13, 54,
254     -67, -4, -67, -4, -67, -4, -67, -4,
255     -90, -61, -90, -61, -90, -61, -90, -61,
256     -73, -90, -73, -90, -73, -90, -73, -90,
257     -22, -78, -22, -78, -22, -78, -22, -78,
258     38, -31, 38, -31, 38, -31, 38, -31,
259     22, -46, 22, -46, 22, -46, 22, -46,
260     -54, -90, -54, -90, -54, -90, -54, -90,
261     -90, -67, -90, -67, -90, -67, -90, -67,
262     -61, 4, -61, 4, -61, 4, -61, 4,
263     13, 73, 13, 73, 13, 73, 13, 73,
264     78, 88, 78, 88, 78, 88, 78, 88,
265     78, -88, 78, -88, 78, -88, 78, -88,
266     -82, 31, -82, 31, -82, 31, -82, 31,
267     -73, 90, -73, 90, -73, 90, -73, 90,
268     13, 54, 13, 54, 13, 54, 13, 54,
269     85, -38, 85, -38, 85, -38, 85, -38,
270     -22, -46, -22, -46, -22, -46, -22, -46,
271     73, -13, 73, -13, 73, -13, 73, -13,
272     -31, 82, -31, 82, -31, 82, -31, 82,
273     -38, 85, -38, 85, -38, 85, -38, 85,
274     -90, 54, -90, 54, -90, 54, -90, 54,
275     67, 90, 67, 90, 67, 90, 67, 90,
276     -54, 13, -54, 13, -54, 13, -54, 13,
277     -78, -88, -78, -88, -78, -88, -78, -88,
278     -22, 46, -22, 46, -22, 46, -22, 46,
279     -90, -73, -90, -73, -90, -73, -90, -73,
280     4, -61, 4, -61, 4, -61, 4, -61,
281     61, -4, 61, -4, 61, -4, 61, -4,
282     -46, 22, -46, 22, -46, 22, -46, 22,
283     82, 85, 82, 85, 82, 85, 82, 85,
284     31, -38, 31, -38, 31, -38, 31, -38,
285     -88, -78, -88, -78, -88, -78, -88, -78,
286     90, 67, 90, 67, 90, 67, 90, 67,
287     54, -90, 54, -90, 54, -90, 54, -90,
288     -85, 38, -85, 38, -85, 38, -85, 38,
289     -4, 67, -4, 67, -4, 67, -4, 67,
290     88, -78, 88, -78, 88, -78, 88, -78,
291     -46, -22, -46, -22, -46, -22, -46, -22,
292     -61, 90, -61, 90, -61, 90, -61, 90,
293     82, -31, 82, -31, 82, -31, 82, -31,
294     13, -73, 13, -73, 13, -73, 13, -73,
295     46, 22, 46, 22, 46, 22, 46, 22,
296     -90, 67, -90, 67, -90, 67, -90, 67,
297     38, -85, 38, -85, 38, -85, 38, -85,
298     54, 13, 54, 13, 54, 13, 54, 13,
299     -90, 73, -90, 73, -90, 73, -90, 73,
300     31, -82, 31, -82, 31, -82, 31, -82,
301     61, 4, 61, 4, 61, 4, 61, 4,
302     -88, 78, -88, 78, -88, 78, -88, 78,
303     38, 85, 38, 85, 38, 85, 38, 85,
304     -4, 61, -4, 61, -4, 61, -4, 61,
305     -67, -90, -67, -90, -67, -90, -67, -90,
306     -31, -82, -31, -82, -31, -82, -31, -82,
307     -78, -22, -78, -22, -78, -22, -78, -22,
308     90, 73, 90, 73, 90, 73, 90, 73,
309     -61, -90, -61, -90, -61, -90, -61, -90,
310     4, 67, 4, 67, 4, 67, 4, 67,
311     54, -13, 54, -13, 54, -13, 54, -13,
312     -88, -46, -88, -46, -88, -46, -88, -46,
313     85, -82, 85, -82, 85, -82, 85, -82,
314     -38, -31, -38, -31, -38, -31, -38, -31,
315     -13, -73, -13, -73, -13, -73, -13, -73,
316     22, 78, 22, 78, 22, 78, 22, 78,
317     -46, -88, -46, -88, -46, -88, -46, -88,
318     54, 90, 54, 90, 54, 90, 54, 90
319 };
320 
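// Constants for the inverse 4x4 DST, interleaved in pairs for _mm_madd_epi16.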
321 #ifdef __GNUC__
322 #ifndef __cplusplus
323 __attribute__((visibility("hidden")))
324 #endif
325 #endif
326 EB_ALIGN(16) const EB_S16 InvDstTransformAsmConst_SSE2[] = {
327     64, 0, 64, 0, 64, 0, 64, 0,
328     29, 84, 29, 84, 29, 84, 29, 84,
329     74, 55, 74, 55, 74, 55, 74, 55,
330     55, -29, 55, -29, 55, -29, 55, -29,
331     74, -84, 74, -84, 74, -84, 74, -84,
332     74, -74, 74, -74, 74, -74, 74, -74,
333     0, 74, 0, 74, 0, 74, 0, 74,
334     84, 55, 84, 55, 84, 55, 84, 55,
335     -74, -29, -74, -29, -74, -29, -74, -29,
336 };
337 
338 
339 // Coefficients for inverse 32-point transform
340 EB_EXTERN const EB_S16 EbHevcCoeff_tbl2[48 * 8] =
341 {
342     64, 89, 64, 75, 64, 50, 64, 18, 64, -18, 64, -50, 64, -75, 64, -89,
343     83, 75, 36, -18, -36, -89, -83, -50, -83, 50, -36, 89, 36, 18, 83, -75,
344     64, 50, -64, -89, -64, 18, 64, 75, 64, -75, -64, -18, -64, 89, 64, -50,
345     36, 18, -83, -50, 83, 75, -36, -89, -36, 89, 83, -75, -83, 50, 36, -18,
346     90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
347     80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
348     57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
349     25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
350     90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
351     61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
352     88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
353     -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
354     82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
355     31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
356     73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
357     -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
358     61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
359     -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
360     46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
361     22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
362     31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
363     -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
364     13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
365     54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
366 };
367 
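// Coefficients for the forward transform, arranged as interleaved pairs for
// _mm_madd_epi16 (used by Transform16 below).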
368 #ifdef __GNUC__
369 #ifndef __cplusplus
370 __attribute__((visibility("hidden")))
371 #endif
372 #endif
373 EB_EXTERN const EB_S16 EbHevcCoeff_tbl[48 * 8] =
374 {
375     64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50,
376     64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89,
377     64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75,
378     64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18,
379     90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
380     80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
381     57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
382     25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
383     90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
384     61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
385     88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
386     -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
387     82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
388     31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
389     73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
390     -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
391     61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
392     -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
393     46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
394     22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
395     31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
396     -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
397     13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
398     54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
399 };
400 
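// Reverse the order of the eight 16-bit lanes of x.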
static __m128i reverse_epi16(__m128i x)
402 {
403     x = _mm_shuffle_epi32(x, 0x1b); // 00011011
404     x = _mm_shufflelo_epi16(x, 0xb1); // 10110001
405     x = _mm_shufflehi_epi16(x, 0xb1);
406     return x;
407 }
408 
409 // 16-point forward transform (16 rows)
static void Transform16(short *src, int src_stride, short *dst, int dst_stride, int shift)
411 {
412     int i;
413     __m128i s0 = _mm_cvtsi32_si128(shift);
414     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
415     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
416 
417     for (i = 0; i < 16; i++)
418     {
419         __m128i x0, x1;
420         __m128i y0, y1;
421         __m128i a0, a1, a2, a3;
422         __m128i b0, b1, b2, b3;
423 
424         y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
425         y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
426 
427 
428         // 16-point butterfly
429         y1 = reverse_epi16(y1);
430 
431         x0 = _mm_add_epi16(y0, y1);
432         x1 = _mm_sub_epi16(y0, y1);
433 
434         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
435         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
436         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
437         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
438 
439         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
440         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
441         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
442         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
443 
444         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
445         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
446         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
447         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
448 
449         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
450         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
451         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
452         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
453 
454         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
455         b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
456         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
457         b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
458 
459         x0 = _mm_packs_epi32(b0, b1);
460         x1 = _mm_packs_epi32(b2, b3);
461 
462         y0 = _mm_unpacklo_epi16(x0, x1);
463         y1 = _mm_unpackhi_epi16(x0, x1);
464 
465         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
466         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), y1);
467     }
468 }
469 
470 // 16-point inverse transform (16 rows)
static void InvTransform16(
472     EB_S16  *src,
473     EB_U32   src_stride,
474     EB_S16  *dst,
475     EB_U32   dst_stride,
476     EB_U32   shift)
477 {
478     int i;
479     __m128i s0 = _mm_cvtsi32_si128(shift);
480     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
481     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
482 
483     for (i = 0; i < 16; i++)
484     {
485         __m128i x0, x1;
486         __m128i y0, y1;
487         __m128i a0, a1, a2, a3;
488         __m128i b0, b1, b2, b3;
489         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
490         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f
491 
492         y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
493         y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
494 
495         x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
496         x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
497 
498         y0 = _mm_unpacklo_epi16(x0, x1); // 00 02 04 06 08 0a 0c 0e
499         y1 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
500 
501         x0 = y0;
502         x1 = y1;
503 
504         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
505         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
506         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
507         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
508 
509         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
510         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
511         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
512         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
513 
514         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
515         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
516         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
517         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
518 
519         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
520         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
521         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
522         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
523 
524 
525         a0 = _mm_add_epi32(a0, o0);
526         a1 = _mm_add_epi32(a1, o0);
527 
528         b0 = _mm_add_epi32(a0, a2);
529         b1 = _mm_add_epi32(a1, a3);
530         b2 = _mm_sub_epi32(a0, a2);
531         b3 = _mm_sub_epi32(a1, a3);
532 
533         a0 = b0;
534         a1 = b1;
535         a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
536         a3 = _mm_shuffle_epi32(b2, 0x1b);
537 
538         a0 = _mm_sra_epi32(a0, s0);
539         a1 = _mm_sra_epi32(a1, s0);
540         a2 = _mm_sra_epi32(a2, s0);
541         a3 = _mm_sra_epi32(a3, s0);
542 
543         x0 = _mm_packs_epi32(a0, a1);
544         x1 = _mm_packs_epi32(a2, a3);
545 
546         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
547         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
548     }
549 }
550 
551 // transpose 16x16 block of data
static void Transpose16(
553     EB_S16     *src,
554     EB_U32      src_stride,
555     EB_S16     *dst,
556     EB_U32      dst_stride)
557 {
558     int i, j;
559     for (i = 0; i < 2; i++)
560     {
561         for (j = 0; j < 2; j++)
562         {
563             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
564             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
565 
566             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
567             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
568             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
569             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
570             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
571             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
572             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
573             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
574 
575             b0 = _mm_unpacklo_epi16(a0, a4);
576             b1 = _mm_unpacklo_epi16(a1, a5);
577             b2 = _mm_unpacklo_epi16(a2, a6);
578             b3 = _mm_unpacklo_epi16(a3, a7);
579             b4 = _mm_unpackhi_epi16(a0, a4);
580             b5 = _mm_unpackhi_epi16(a1, a5);
581             b6 = _mm_unpackhi_epi16(a2, a6);
582             b7 = _mm_unpackhi_epi16(a3, a7);
583 
584             a0 = _mm_unpacklo_epi16(b0, b2);
585             a1 = _mm_unpacklo_epi16(b1, b3);
586             a2 = _mm_unpackhi_epi16(b0, b2);
587             a3 = _mm_unpackhi_epi16(b1, b3);
588             a4 = _mm_unpacklo_epi16(b4, b6);
589             a5 = _mm_unpacklo_epi16(b5, b7);
590             a6 = _mm_unpackhi_epi16(b4, b6);
591             a7 = _mm_unpackhi_epi16(b5, b7);
592 
593             b0 = _mm_unpacklo_epi16(a0, a1);
594             b1 = _mm_unpackhi_epi16(a0, a1);
595             b2 = _mm_unpacklo_epi16(a2, a3);
596             b3 = _mm_unpackhi_epi16(a2, a3);
597             b4 = _mm_unpacklo_epi16(a4, a5);
598             b5 = _mm_unpackhi_epi16(a4, a5);
599             b6 = _mm_unpacklo_epi16(a6, a7);
600             b7 = _mm_unpackhi_epi16(a6, a7);
601 
602             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
603             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
604             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
605             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
606             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
607             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
608             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
609             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
610         }
611     }
612 }
613 
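// Partial-frequency transpose: only the first 16 columns of the 32-row block are
// transposed (32x16 -> 16x32).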
static void PfreqTranspose32_SSE2(
615     EB_S16 *src,
616     EB_U32  src_stride,
617     EB_S16 *dst,
618     EB_U32  dst_stride)
619 {
620     EB_U32 i, j;
621     for (i = 0; i < 4; i++)
622     {
623         for (j = 0; j < 2; j++)
624         {
625             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
626             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
627 
628             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
629             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
630             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
631             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
632             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
633             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
634             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
635             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
636 
637             b0 = _mm_unpacklo_epi16(a0, a4);
638             b1 = _mm_unpacklo_epi16(a1, a5);
639             b2 = _mm_unpacklo_epi16(a2, a6);
640             b3 = _mm_unpacklo_epi16(a3, a7);
641             b4 = _mm_unpackhi_epi16(a0, a4);
642             b5 = _mm_unpackhi_epi16(a1, a5);
643             b6 = _mm_unpackhi_epi16(a2, a6);
644             b7 = _mm_unpackhi_epi16(a3, a7);
645 
646             a0 = _mm_unpacklo_epi16(b0, b2);
647             a1 = _mm_unpacklo_epi16(b1, b3);
648             a2 = _mm_unpackhi_epi16(b0, b2);
649             a3 = _mm_unpackhi_epi16(b1, b3);
650             a4 = _mm_unpacklo_epi16(b4, b6);
651             a5 = _mm_unpacklo_epi16(b5, b7);
652             a6 = _mm_unpackhi_epi16(b4, b6);
653             a7 = _mm_unpackhi_epi16(b5, b7);
654 
655             b0 = _mm_unpacklo_epi16(a0, a1);
656             b1 = _mm_unpackhi_epi16(a0, a1);
657             b2 = _mm_unpacklo_epi16(a2, a3);
658             b3 = _mm_unpackhi_epi16(a2, a3);
659             b4 = _mm_unpacklo_epi16(a4, a5);
660             b5 = _mm_unpackhi_epi16(a4, a5);
661             b6 = _mm_unpacklo_epi16(a6, a7);
662             b7 = _mm_unpackhi_epi16(a6, a7);
663 
664             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
665             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
666             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
667             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
668             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
669             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
670             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
671             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
672         }
673     }
674 }
675 
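// N/4 partial-frequency transpose, second pass: only the top-left 8x8 block is transposed.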
static void PfreqN4SecTranspose32_SSE2(
677 	EB_S16 *src,
678 	EB_U32  src_stride,
679 	EB_S16 *dst,
680 	EB_U32  dst_stride)
681 {
682 	EB_U32 i, j;
683 
684 	i = j = 0;
685 	{
686 		{
687 			__m128i a0, a1, a2, a3, a4, a5, a6, a7;
688 			__m128i b0, b1, b2, b3, b4, b5, b6, b7;
689 
690 			a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
691 			a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
692 			a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
693 			a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
694 			a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
695 			a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
696 			a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
697 			a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
698 
699 			b0 = _mm_unpacklo_epi16(a0, a4);
700 			b1 = _mm_unpacklo_epi16(a1, a5);
701 			b2 = _mm_unpacklo_epi16(a2, a6);
702 			b3 = _mm_unpacklo_epi16(a3, a7);
703 			b4 = _mm_unpackhi_epi16(a0, a4);
704 			b5 = _mm_unpackhi_epi16(a1, a5);
705 			b6 = _mm_unpackhi_epi16(a2, a6);
706 			b7 = _mm_unpackhi_epi16(a3, a7);
707 
708 			a0 = _mm_unpacklo_epi16(b0, b2);
709 			a1 = _mm_unpacklo_epi16(b1, b3);
710 			a2 = _mm_unpackhi_epi16(b0, b2);
711 			a3 = _mm_unpackhi_epi16(b1, b3);
712 			a4 = _mm_unpacklo_epi16(b4, b6);
713 			a5 = _mm_unpacklo_epi16(b5, b7);
714 			a6 = _mm_unpackhi_epi16(b4, b6);
715 			a7 = _mm_unpackhi_epi16(b5, b7);
716 
717 			b0 = _mm_unpacklo_epi16(a0, a1);
718 			b1 = _mm_unpackhi_epi16(a0, a1);
719 			b2 = _mm_unpacklo_epi16(a2, a3);
720 			b3 = _mm_unpackhi_epi16(a2, a3);
721 			b4 = _mm_unpacklo_epi16(a4, a5);
722 			b5 = _mm_unpackhi_epi16(a4, a5);
723 			b6 = _mm_unpacklo_epi16(a6, a7);
724 			b7 = _mm_unpackhi_epi16(a6, a7);
725 
726 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
727 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
728 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
729 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
730 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
731 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
732 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
733 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
734 
735 		}
736 	}
}
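
// N/4 partial-frequency transpose, first pass: only the first 8 columns of the 32-row
// block are transposed (32x8 -> 8x32).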
static void PfreqN4FirstTranspose32_SSE2(
739 	EB_S16 *src,
740 	EB_U32  src_stride,
741 	EB_S16 *dst,
742 	EB_U32  dst_stride)
743 {
744 	EB_U32 i, j;
745 
746 	for (i = 0; i < 4; i++)
747 	{
748 		//for (j = 0; j < 2; j++)
749 		j = 0;
750 		{
751 
752 			__m128i a0, a1, a2, a3, a4, a5, a6, a7;
753 			__m128i b0, b1, b2, b3, b4, b5, b6, b7;
754 
755 			a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
756 			a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
757 			a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
758 			a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
759 			a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
760 			a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
761 			a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
762 			a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
763 
764 			b0 = _mm_unpacklo_epi16(a0, a4);
765 			b1 = _mm_unpacklo_epi16(a1, a5);
766 			b2 = _mm_unpacklo_epi16(a2, a6);
767 			b3 = _mm_unpacklo_epi16(a3, a7);
768 			b4 = _mm_unpackhi_epi16(a0, a4);
769 			b5 = _mm_unpackhi_epi16(a1, a5);
770 			b6 = _mm_unpackhi_epi16(a2, a6);
771 			b7 = _mm_unpackhi_epi16(a3, a7);
772 
773 			a0 = _mm_unpacklo_epi16(b0, b2);
774 			a1 = _mm_unpacklo_epi16(b1, b3);
775 			a2 = _mm_unpackhi_epi16(b0, b2);
776 			a3 = _mm_unpackhi_epi16(b1, b3);
777 			a4 = _mm_unpacklo_epi16(b4, b6);
778 			a5 = _mm_unpacklo_epi16(b5, b7);
779 			a6 = _mm_unpackhi_epi16(b4, b6);
780 			a7 = _mm_unpackhi_epi16(b5, b7);
781 
782 			b0 = _mm_unpacklo_epi16(a0, a1);
783 			b1 = _mm_unpackhi_epi16(a0, a1);
784 			b2 = _mm_unpacklo_epi16(a2, a3);
785 			b3 = _mm_unpackhi_epi16(a2, a3);
786 			b4 = _mm_unpacklo_epi16(a4, a5);
787 			b5 = _mm_unpackhi_epi16(a4, a5);
788 			b6 = _mm_unpacklo_epi16(a6, a7);
789 			b7 = _mm_unpackhi_epi16(a6, a7);
790 
791 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
792 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
793 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
794 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
795 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
796 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
797 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
798 			_mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
799 
800 		}
801 	}
}
803 
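// Partial-frequency transpose, type 1: transposes the top-left 16x16 block.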
void EbHevcPfreqTranspose32Type1_SSE2(
805     EB_S16 *src,
806     EB_U32  src_stride,
807     EB_S16 *dst,
808     EB_U32  dst_stride)
809 {
810     EB_U32 i, j;
811     for (i = 0; i < 2; i++)
812     {
813         for (j = 0; j < 2; j++)
814         {
815             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
816             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
817 
818             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
819             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
820             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
821             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
822             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
823             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
824             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
825             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
826 
827             b0 = _mm_unpacklo_epi16(a0, a4);
828             b1 = _mm_unpacklo_epi16(a1, a5);
829             b2 = _mm_unpacklo_epi16(a2, a6);
830             b3 = _mm_unpacklo_epi16(a3, a7);
831             b4 = _mm_unpackhi_epi16(a0, a4);
832             b5 = _mm_unpackhi_epi16(a1, a5);
833             b6 = _mm_unpackhi_epi16(a2, a6);
834             b7 = _mm_unpackhi_epi16(a3, a7);
835 
836             a0 = _mm_unpacklo_epi16(b0, b2);
837             a1 = _mm_unpacklo_epi16(b1, b3);
838             a2 = _mm_unpackhi_epi16(b0, b2);
839             a3 = _mm_unpackhi_epi16(b1, b3);
840             a4 = _mm_unpacklo_epi16(b4, b6);
841             a5 = _mm_unpacklo_epi16(b5, b7);
842             a6 = _mm_unpackhi_epi16(b4, b6);
843             a7 = _mm_unpackhi_epi16(b5, b7);
844 
845             b0 = _mm_unpacklo_epi16(a0, a1);
846             b1 = _mm_unpackhi_epi16(a0, a1);
847             b2 = _mm_unpacklo_epi16(a2, a3);
848             b3 = _mm_unpackhi_epi16(a2, a3);
849             b4 = _mm_unpacklo_epi16(a4, a5);
850             b5 = _mm_unpackhi_epi16(a4, a5);
851             b6 = _mm_unpacklo_epi16(a6, a7);
852             b7 = _mm_unpackhi_epi16(a6, a7);
853 
854             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
855             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
856             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
857             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
858             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
859             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
860             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
861             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
862         }
863     }
864 }
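
// Partial-frequency transpose, type 2: the first 16 rows of the 32-column block are
// transposed (16x32 -> 32x16).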
void PfreqTranspose32Type2_SSE2(
866     EB_S16 *src,
867     EB_U32  src_stride,
868     EB_S16 *dst,
869     EB_U32  dst_stride)
870 {
871     EB_U32 i, j;
872     for (i = 0; i < 2; i++)
873     {
874         for (j = 0; j < 4; j++)
875         {
876             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
877             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
878 
879             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
880             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
881             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
882             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
883             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
884             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
885             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
886             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
887 
888             b0 = _mm_unpacklo_epi16(a0, a4);
889             b1 = _mm_unpacklo_epi16(a1, a5);
890             b2 = _mm_unpacklo_epi16(a2, a6);
891             b3 = _mm_unpacklo_epi16(a3, a7);
892             b4 = _mm_unpackhi_epi16(a0, a4);
893             b5 = _mm_unpackhi_epi16(a1, a5);
894             b6 = _mm_unpackhi_epi16(a2, a6);
895             b7 = _mm_unpackhi_epi16(a3, a7);
896 
897             a0 = _mm_unpacklo_epi16(b0, b2);
898             a1 = _mm_unpacklo_epi16(b1, b3);
899             a2 = _mm_unpackhi_epi16(b0, b2);
900             a3 = _mm_unpackhi_epi16(b1, b3);
901             a4 = _mm_unpacklo_epi16(b4, b6);
902             a5 = _mm_unpacklo_epi16(b5, b7);
903             a6 = _mm_unpackhi_epi16(b4, b6);
904             a7 = _mm_unpackhi_epi16(b5, b7);
905 
906             b0 = _mm_unpacklo_epi16(a0, a1);
907             b1 = _mm_unpackhi_epi16(a0, a1);
908             b2 = _mm_unpacklo_epi16(a2, a3);
909             b3 = _mm_unpackhi_epi16(a2, a3);
910             b4 = _mm_unpacklo_epi16(a4, a5);
911             b5 = _mm_unpackhi_epi16(a4, a5);
912             b6 = _mm_unpacklo_epi16(a6, a7);
913             b7 = _mm_unpackhi_epi16(a6, a7);
914 
915             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
916             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
917             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
918             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
919             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
920             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
921             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
922             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
923         }
924     }
925 }
926 
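// Transpose a full 32x32 block of 16-bit data.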
static void Transpose32_SSE2(
928     EB_S16 *src,
929     EB_U32  src_stride,
930     EB_S16 *dst,
931     EB_U32  dst_stride)
932 {
933     EB_U32 i, j;
934     for (i = 0; i < 4; i++)
935     {
936         for (j = 0; j < 4; j++)
937         {
938             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
939             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
940 
941             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
942             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
943             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
944             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
945             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
946             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
947             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
948             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
949 
950             b0 = _mm_unpacklo_epi16(a0, a4);
951             b1 = _mm_unpacklo_epi16(a1, a5);
952             b2 = _mm_unpacklo_epi16(a2, a6);
953             b3 = _mm_unpacklo_epi16(a3, a7);
954             b4 = _mm_unpackhi_epi16(a0, a4);
955             b5 = _mm_unpackhi_epi16(a1, a5);
956             b6 = _mm_unpackhi_epi16(a2, a6);
957             b7 = _mm_unpackhi_epi16(a3, a7);
958 
959             a0 = _mm_unpacklo_epi16(b0, b2);
960             a1 = _mm_unpacklo_epi16(b1, b3);
961             a2 = _mm_unpackhi_epi16(b0, b2);
962             a3 = _mm_unpackhi_epi16(b1, b3);
963             a4 = _mm_unpacklo_epi16(b4, b6);
964             a5 = _mm_unpacklo_epi16(b5, b7);
965             a6 = _mm_unpackhi_epi16(b4, b6);
966             a7 = _mm_unpackhi_epi16(b5, b7);
967 
968             b0 = _mm_unpacklo_epi16(a0, a1);
969             b1 = _mm_unpackhi_epi16(a0, a1);
970             b2 = _mm_unpacklo_epi16(a2, a3);
971             b3 = _mm_unpackhi_epi16(a2, a3);
972             b4 = _mm_unpacklo_epi16(a4, a5);
973             b5 = _mm_unpackhi_epi16(a4, a5);
974             b6 = _mm_unpacklo_epi16(a6, a7);
975             b7 = _mm_unpackhi_epi16(a6, a7);
976 
977             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
978             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
979             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
980             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
981             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
982             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
983             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
984             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
985         }
986     }
987 }
988 
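// Partial-frequency inverse 32-point transform over 32 rows: only the first 16 coefficients
// of each row are read (the rest are assumed zero), and all 32 outputs per row are written.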
void Pfreq2DInvTransform32_SSE2(
990     EB_S16 *src,
991     EB_U32  src_stride,
992     EB_S16 *dst,
993     EB_U32  dst_stride,
994     EB_U32  shift)
995 {
996     EB_U32 i;
997     __m128i s0 = _mm_cvtsi32_si128(shift);
998     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
999     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1000 
1001     for (i = 0; i < 32; i++)
1002     {
1003         __m128i x0, x1, x2, x3;
1004         __m128i y0, y1, y2;
1005         __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1006         __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1007         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
1008         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1009 
1010         y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1011         y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1012 
1013         x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1014         x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1015 
        y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 08 0c duplicated; 10 14 18 1c are assumed zero
        y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 0a 0e duplicated; 12 16 1a 1e are assumed zero
1018         y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1019 
1020         x0 = y0;   //part of it zero
1021         x1 = y1;   //part of it zero
1022         x2 = y2;
1023 
1024         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1025         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1026 
1027         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1028         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1029 
1030         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1031         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1032 
1033         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1034         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1035 
1036         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1037         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1038         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1039         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1040 
1041         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1042         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1043         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1044         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1045 
1046         a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1047         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1048         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1049         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1050 
1051         a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1052         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1053         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1054         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1055 
1056         a0 = _mm_add_epi32(a0, o0);
1057         a1 = _mm_add_epi32(a1, o0);
1058 
1059         b0 = _mm_add_epi32(a0, a2);
1060         b1 = _mm_add_epi32(a1, a3);
1061         b2 = _mm_sub_epi32(a0, a2);
1062         b3 = _mm_sub_epi32(a1, a3);
1063 
1064         a0 = b0;
1065         a1 = b1;
1066         a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1067         a3 = _mm_shuffle_epi32(b2, 0x1b);
1068 
1069         b0 = _mm_add_epi32(a0, a4);
1070         b1 = _mm_add_epi32(a1, a5);
1071         b2 = _mm_add_epi32(a2, a6);
1072         b3 = _mm_add_epi32(a3, a7);
1073         b4 = _mm_sub_epi32(a0, a4);
1074         b5 = _mm_sub_epi32(a1, a5);
1075         b6 = _mm_sub_epi32(a2, a6);
1076         b7 = _mm_sub_epi32(a3, a7);
1077 
1078         a0 = _mm_sra_epi32(b0, s0);
1079         a1 = _mm_sra_epi32(b1, s0);
1080         a2 = _mm_sra_epi32(b2, s0);
1081         a3 = _mm_sra_epi32(b3, s0);
1082         a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1083         a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1084         a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1085         a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1086 
1087         x0 = _mm_packs_epi32(a0, a1);
1088         x1 = _mm_packs_epi32(a2, a3);
1089         x2 = _mm_packs_epi32(a4, a5);
1090         x3 = _mm_packs_epi32(a6, a7);
1091 
1092         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1093         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1094         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
1095         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
1096     }
1097 }
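
// Same partial-frequency inverse 32-point transform, applied to 16 rows only.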
void Pfreq1DInvTransform32_SSE2(
1099     EB_S16 *src,
1100     EB_U32  src_stride,
1101     EB_S16 *dst,
1102     EB_U32  dst_stride,
1103     EB_U32  shift)
1104 {
1105     EB_U32 i;
1106     __m128i s0 = _mm_cvtsi32_si128(shift);
1107     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1108     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1109 
1110     for (i = 0; i < 16; i++)
1111     {
1112         __m128i x0, x1, x2, x3;
1113         __m128i y0, y1, y2;
1114         __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1115         __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1116         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
1117         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1118 
1119         y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1120         y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1121 
1122         x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1123         x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1124 
        y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 08 0c duplicated; 10 14 18 1c are assumed zero
        y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 0a 0e duplicated; 12 16 1a 1e are assumed zero
1127         y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1128 
1129         x0 = y0;   //part of it zero
1130         x1 = y1;   //part of it zero
1131         x2 = y2;
1132 
1133         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1134         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1135 
1136         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1137         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1138 
1139         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1140         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1141 
1142         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1143         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1144 
1145         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1146         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1147         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1148         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1149 
1150         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1151         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1152         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1153         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1154 
1155         a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1156         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1157         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1158         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1159 
1160         a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1161         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1162         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1163         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1164 
1165         a0 = _mm_add_epi32(a0, o0);
1166         a1 = _mm_add_epi32(a1, o0);
1167 
1168         b0 = _mm_add_epi32(a0, a2);
1169         b1 = _mm_add_epi32(a1, a3);
1170         b2 = _mm_sub_epi32(a0, a2);
1171         b3 = _mm_sub_epi32(a1, a3);
1172 
1173         a0 = b0;
1174         a1 = b1;
1175         a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1176         a3 = _mm_shuffle_epi32(b2, 0x1b);
1177 
1178         b0 = _mm_add_epi32(a0, a4);
1179         b1 = _mm_add_epi32(a1, a5);
1180         b2 = _mm_add_epi32(a2, a6);
1181         b3 = _mm_add_epi32(a3, a7);
1182         b4 = _mm_sub_epi32(a0, a4);
1183         b5 = _mm_sub_epi32(a1, a5);
1184         b6 = _mm_sub_epi32(a2, a6);
1185         b7 = _mm_sub_epi32(a3, a7);
1186 
1187         a0 = _mm_sra_epi32(b0, s0);
1188         a1 = _mm_sra_epi32(b1, s0);
1189         a2 = _mm_sra_epi32(b2, s0);
1190         a3 = _mm_sra_epi32(b3, s0);
1191         a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1192         a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1193         a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1194         a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1195 
1196         x0 = _mm_packs_epi32(a0, a1);
1197         x1 = _mm_packs_epi32(a2, a3);
1198         x2 = _mm_packs_epi32(a4, a5);
1199         x3 = _mm_packs_epi32(a6, a7);
1200 
1201         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1202         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1203         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
1204         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
1205     }
1206 }
1207 
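// Partial-frequency estimate of the 32x32 inverse transform:
// transpose, first inverse pass (shift 7), transpose, second inverse pass (shift 12 - addshift).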
1208 void PfreqEstimateInvTransform32x32_SSE2(
1209     EB_S16 *src,
1210     const EB_U32  src_stride,
1211     EB_S16 *dst,
1212     const EB_U32  dst_stride,
1213     EB_S16 *intermediate,
1214     EB_U32  addshift)
1215 {
1216     EbHevcPfreqTranspose32Type1_SSE2(src, src_stride, intermediate, 32);
1217     Pfreq1DInvTransform32_SSE2(intermediate, 32, dst, dst_stride, 7);
1218     PfreqTranspose32Type2_SSE2(dst, dst_stride, intermediate, 32);
1219     Pfreq2DInvTransform32_SSE2(intermediate, 32, dst, dst_stride, 12 - addshift);
1220 }
1221 
1222 // 32-point inverse transform (32 rows)
1223 static void InvTransform32_SSE2(
1224     EB_S16 *src,
1225     EB_U32  src_stride,
1226     EB_S16 *dst,
1227     EB_U32  dst_stride,
1228     EB_U32  shift)
1229 {
1230     EB_U32 i;
1231     __m128i s0 = _mm_cvtsi32_si128(shift);
1232     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1233     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1234 
1235     for (i = 0; i < 32; i++)
1236     {
1237         __m128i x0, x1, x2, x3;
1238         __m128i y0, y1, y2, y3;
1239         __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1240         __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1241         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
1242         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1243         x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10)); // 10 11 12 13 14 15 16 17
1244         x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18)); // 18 19 1a 1b 1c 1d 1e 1f
1245 
1246         y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1247         y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1248         y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
1249         y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c
1250 
1251         x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1252         x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1253         x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
1254         x3 = _mm_unpackhi_epi16(y2, y3); // 12 16
1255 
1256         y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
1257         y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
1258         y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1259         y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f
1260 
1261         x0 = y0;
1262         x1 = y1;
1263         x2 = y2;
1264         x3 = y3;
1265 
1266         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1267         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1268         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
1269         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
1270 
1271         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1272         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1273         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1274         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
1275 
1276         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1277         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1278         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
1279         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
1280 
1281         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1282         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1283         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1284         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
1285 
1286         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1287         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1288         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1289         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1290         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
1291         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
1292         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
1293         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
1294 
1295         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1296         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1297         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1298         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1299         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1300         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1301         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
1302         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
1303 
1304         a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1305         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1306         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1307         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1308         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1309         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1310         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
1311         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
1312 
1313         a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1314         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1315         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1316         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1317         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1318         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1319         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
1320         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
1321 
1322         a0 = _mm_add_epi32(a0, o0);
1323         a1 = _mm_add_epi32(a1, o0);
1324 
1325         b0 = _mm_add_epi32(a0, a2);
1326         b1 = _mm_add_epi32(a1, a3);
1327         b2 = _mm_sub_epi32(a0, a2);
1328         b3 = _mm_sub_epi32(a1, a3);
1329 
1330         a0 = b0;
1331         a1 = b1;
1332         a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1333         a3 = _mm_shuffle_epi32(b2, 0x1b);
1334 
1335         b0 = _mm_add_epi32(a0, a4);
1336         b1 = _mm_add_epi32(a1, a5);
1337         b2 = _mm_add_epi32(a2, a6);
1338         b3 = _mm_add_epi32(a3, a7);
1339         b4 = _mm_sub_epi32(a0, a4);
1340         b5 = _mm_sub_epi32(a1, a5);
1341         b6 = _mm_sub_epi32(a2, a6);
1342         b7 = _mm_sub_epi32(a3, a7);
1343 
1344         a0 = _mm_sra_epi32(b0, s0);
1345         a1 = _mm_sra_epi32(b1, s0);
1346         a2 = _mm_sra_epi32(b2, s0);
1347         a3 = _mm_sra_epi32(b3, s0);
1348         a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1349         a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1350         a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1351         a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1352 
1353         x0 = _mm_packs_epi32(a0, a1);
1354         x1 = _mm_packs_epi32(a2, a3);
1355         x2 = _mm_packs_epi32(a4, a5);
1356         x3 = _mm_packs_epi32(a6, a7);
1357 
1358         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1359         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1360         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
1361         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
1362     }
1363 }
1364 
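// inverse 32x32 transform (two passes of InvTransform32_SSE2, transposing before each)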
1365 void EstimateInvTransform32x32_SSE2(
1366     EB_S16 *src,
1367     const EB_U32  src_stride,
1368     EB_S16 *dst,
1369     const EB_U32  dst_stride,
1370     EB_S16 *intermediate,
1371     EB_U32  addshift)
1372 {
1373     Transpose32_SSE2(src, src_stride, intermediate, 32);
1374     InvTransform32_SSE2(intermediate, 32, dst, dst_stride, 7);
1375     Transpose32_SSE2(dst, dst_stride, intermediate, 32);
1376     InvTransform32_SSE2(intermediate, 32, dst, dst_stride, 12 - addshift);
1377 }
1378 
1379 
1380 
1381 // forward 16x16 transform
1382 void Transform16x16_SSE2(
1383     EB_S16 *src,
1384     const EB_U32 src_stride,
1385     EB_S16 *dst,
1386     const EB_U32 dst_stride,
1387     EB_S16 *intermediate,
1388     EB_U32  addshift)
1389 {
1390     Transform16(src, src_stride, intermediate, 16, 4 + addshift);
1391     Transpose16(intermediate, 16, dst, dst_stride);
1392     Transform16(dst, dst_stride, intermediate, 16, 9);
1393     Transpose16(intermediate, 16, dst, dst_stride);
1394 }
1395 
1396 
1397 // inverse 16x16 transform
1398 void EstimateInvTransform16x16_SSE2(
1399     EB_S16  *src,
1400     EB_U32   src_stride,
1401     EB_S16  *dst,
1402     EB_U32   dst_stride,
1403     EB_S16  *intermediate,
1404     EB_U32   addshift)
1405 {
1406     Transpose16(src, src_stride, intermediate, 16);
1407     InvTransform16(intermediate, 16, dst, dst_stride, 7);
1408     Transpose16(dst, dst_stride, intermediate, 16);
1409     InvTransform16(intermediate, 16, dst, dst_stride, 12 - addshift);
1410 }
1411 
1412 
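// forward 32-point transform (one 1-D pass over 32 rows)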
1413 static void Transform32_SSE2(
1414     EB_S16 *src,
1415     EB_U32  src_stride,
1416     EB_S16 *dst,
1417     EB_U32  dst_stride,
1418     EB_U32  shift)
1419 {
1420     EB_U32 i;
1421     __m128i s0 = _mm_cvtsi32_si128(shift);
1422     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1423     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
1424 
1425     for (i = 0; i < 32; i++)
1426     {
1427         __m128i x0, x1, x2, x3;
1428         __m128i y0, y1, y2, y3;
1429         __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1430         __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1431 
1432         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
1433         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
1434         x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
1435         x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));
1436 
1437         // 32-point butterfly
1438         x2 = reverse_epi16(x2);
1439         x3 = reverse_epi16(x3);
1440 
1441         y0 = _mm_add_epi16(x0, x3);
1442         y1 = _mm_add_epi16(x1, x2);
1443 
1444         y2 = _mm_sub_epi16(x0, x3);
1445         y3 = _mm_sub_epi16(x1, x2);
1446 
1447         // 16-point butterfly
1448         y1 = reverse_epi16(y1);
1449 
1450         x0 = _mm_add_epi16(y0, y1);
1451         x1 = _mm_sub_epi16(y0, y1);
1452 
1453         x2 = y2;
1454         x3 = y3;
1455 
1456         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1457         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1458         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
1459         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
1460 
1461         a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1462         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1463         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1464         a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
1465 
1466         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1467         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1468         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
1469         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
1470 
1471         a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1472         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1473         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1474         a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
1475 
1476         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1477         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1478         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1479         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1480         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
1481         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
1482         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
1483         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
1484 
1485         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1486         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1487         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1488         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1489         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1490         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1491         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
1492         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
1493 
1494         a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1495         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1496         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1497         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1498         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1499         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1500         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
1501         a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
1502 
1503         a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1504         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1505         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1506         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1507         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1508         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1509         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
1510         a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
1511 
1512         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
1513         b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
1514         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
1515         b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
1516         b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
1517         b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
1518         b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
1519         b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
1520 
1521         x0 = _mm_packs_epi32(b0, b1);
1522         x1 = _mm_packs_epi32(b2, b3);
1523         x2 = _mm_packs_epi32(b4, b5);
1524         x3 = _mm_packs_epi32(b6, b7);
1525 
1526         y0 = _mm_unpacklo_epi16(x0, x1);
1527         y1 = _mm_unpackhi_epi16(x0, x1);
1528         y2 = x2;
1529         y3 = x3;
1530         x0 = _mm_unpacklo_epi16(y0, y2);
1531         x1 = _mm_unpackhi_epi16(y0, y2);
1532         x2 = _mm_unpacklo_epi16(y1, y3);
1533         x3 = _mm_unpackhi_epi16(y1, y3);
1534 
1535         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1536         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1537         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
1538         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
1539     }
1540 }
1541 
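// Pfreq first-pass forward 32-point transform: all 32 rows, but only the first 16
// outputs of each row (the low-frequency half) are computed and stored; the a1/a3/a6/a7
// paths of the full transform are left commented out.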
1542 static void Pfreq1DTransform32_SSE2(
1543     EB_S16 *src,
1544     EB_U32  src_stride,
1545     EB_S16 *dst,
1546     EB_U32  dst_stride,
1547     EB_U32  shift)
1548 {
1549     EB_U32 i;
1550     __m128i s0 = _mm_cvtsi32_si128(shift);
1551     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1552     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
1553 
1554     for (i = 0; i < 32; i++)
1555     {
1556         __m128i x0, x1, x2, x3;
1557         __m128i y0, y1, y2, y3;
1558         __m128i a0, a2, a4, a5;
1559         __m128i b0, b1, b2, b3, b4, b5;
1560 
1561 
1562         b1 = s0;   // b1/b3 only need a defined value for _mm_packs_epi32 below;
1563         b3 = s0;   // their lanes are discarded by the following unpacks
1564 
1565         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
1566         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
1567         x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
1568         x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));
1569 
1570 
1571         // 32-point butterfly
1572         x2 = reverse_epi16(x2);
1573         x3 = reverse_epi16(x3);
1574 
1575         y0 = _mm_add_epi16(x0, x3);
1576         y1 = _mm_add_epi16(x1, x2);
1577 
1578         y2 = _mm_sub_epi16(x0, x3);
1579         y3 = _mm_sub_epi16(x1, x2);
1580 
1581         // 16-point butterfly
1582         y1 = reverse_epi16(y1);
1583 
1584         x0 = _mm_add_epi16(y0, y1);
1585         x1 = _mm_sub_epi16(y0, y1);
1586 
1587 
1588         x2 = y2;
1589         x3 = y3;
1590 
1591 
1592 
1593         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1594         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1595         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
1596         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
1597 
1598         //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1599         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1600         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1601         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
1602 
1603         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1604         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1605         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
1606         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
1607 
1608         //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1609         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1610         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1611         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
1612 
1613         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1614         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1615         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1616         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1617         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
1618         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
1619         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
1620         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
1621 
1622         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1623         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1624         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1625         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1626         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1627         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1628         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
1629         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
1630 
1631         //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1632         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1633         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1634         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1635         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1636         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1637         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
1638         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
1639         //
1640         //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1641         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1642         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1643         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1644         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1645         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1646         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
1647         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
1648 
1649         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
1650         //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
1651         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
1652         //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
1653         b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
1654         b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
1655         //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
1656         //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
1657 
1658         x0 = _mm_packs_epi32(b0, b1);
1659         x1 = _mm_packs_epi32(b2, b3);
1660         x2 = _mm_packs_epi32(b4, b5);
1661         //x3 = _mm_packs_epi32(b6, b7);
1662 
1663         y0 = _mm_unpacklo_epi16(x0, x1);
1664         //y1 = _mm_unpackhi_epi16(x0, x1);
1665         y2 = x2;
1666         //y3 = x3;
1667         x0 = _mm_unpacklo_epi16(y0, y2);
1668         x1 = _mm_unpackhi_epi16(y0, y2);
1669         //x2 = _mm_unpacklo_epi16(y1, y3);
1670         //x3 = _mm_unpackhi_epi16(y1, y3);
1671 
1672         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1673         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1674         //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
1675         //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
1676     }
1677 }
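
// Pfreq second-pass forward 32-point transform: the same row computation as above,
// applied to the first 16 rows only, yielding the 16x16 low-frequency coefficient block.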
1678 static void Pfreq2DTransform32_SSE2(
1679     EB_S16 *src,
1680     EB_U32  src_stride,
1681     EB_S16 *dst,
1682     EB_U32  dst_stride,
1683     EB_U32  shift)
1684 {
1685     EB_U32 i;
1686     __m128i s0 = _mm_cvtsi32_si128(shift);
1687     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1688     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
1689 
1690     for (i = 0; i < 16; i++)
1691     {
1692         __m128i x0, x1, x2, x3;
1693         __m128i y0, y1, y2, y3;
1694         __m128i a0, a2, a4, a5;
1695         __m128i b0, b1, b2, b3, b4, b5;
1696 
1697         b1 = s0;
1698         b3 = s0;
1699 
1700         x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
1701         x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
1702         x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
1703         x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));
1704 
1705 
1706         // 32-point butterfly
1707         x2 = reverse_epi16(x2);
1708         x3 = reverse_epi16(x3);
1709 
1710         y0 = _mm_add_epi16(x0, x3);
1711         y1 = _mm_add_epi16(x1, x2);
1712 
1713         y2 = _mm_sub_epi16(x0, x3);
1714         y3 = _mm_sub_epi16(x1, x2);
1715 
1716         // 16-point butterfly
1717         y1 = reverse_epi16(y1);
1718 
1719         x0 = _mm_add_epi16(y0, y1);
1720         x1 = _mm_sub_epi16(y0, y1);
1721 
1722 
1723         x2 = y2;
1724         x3 = y3;
1725 
1726 
1727 
1728         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1729         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1730         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
1731         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
1732 
1733         //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1734         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1735         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1736         //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
1737 
1738         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1739         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1740         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
1741         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
1742 
1743         //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1744         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1745         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1746         //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
1747 
1748         a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1749         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1750         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1751         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1752         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
1753         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
1754         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
1755         a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
1756 
1757         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1758         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1759         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1760         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1761         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1762         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1763         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
1764         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
1765 
1766         //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1767         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1768         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1769         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1770         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1771         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1772         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
1773         //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
1774         //
1775         //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1776         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1777         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1778         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1779         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1780         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1781         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
1782         //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
1783 
1784         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
1785         //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
1786         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
1787         //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
1788         b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
1789         b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
1790         //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
1791         //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
1792 
1793         x0 = _mm_packs_epi32(b0, b1);
1794         x1 = _mm_packs_epi32(b2, b3);
1795         x2 = _mm_packs_epi32(b4, b5);
1796         //x3 = _mm_packs_epi32(b6, b7);
1797 
1798         y0 = _mm_unpacklo_epi16(x0, x1);
1799         //y1 = _mm_unpackhi_epi16(x0, x1);
1800         y2 = x2;
1801         //y3 = x3;
1802         x0 = _mm_unpacklo_epi16(y0, y2);
1803         x1 = _mm_unpackhi_epi16(y0, y2);
1804         //x2 = _mm_unpacklo_epi16(y1, y3);
1805         //x3 = _mm_unpackhi_epi16(y1, y3);
1806 
1807         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1808         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1809         //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
1810         //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
1811     }
1812 }
1813 
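// PfreqN4 (N/4) first-pass forward 32-point transform: all 32 rows, keeping only the
// first 8 outputs of each row.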
1814 static void PfreqN41DTransform32_SSE2(
1815 	EB_S16 *src,
1816 	EB_U32  src_stride,
1817 	EB_S16 *dst,
1818 	EB_U32  dst_stride,
1819 	EB_U32  shift)
1820 {
1821 	EB_U32 i;
1822 	__m128i s0 = _mm_cvtsi32_si128(shift);
1823 	__m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1824 	const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
1825 
1826 	for (i = 0; i < 32; i++)
1827 	{
1828 		__m128i x0, x1, x2, x3;
1829 		__m128i y0, y1, y2, y3;
1830 		__m128i a0, a2, a4/*, a5*/;
1831 		__m128i b0, b1, b2, b3, b4/*, b5*/;
1832 
1833 
1834 		b1 = s0;
1835 		b3 = s0;
1836 
1837 		x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
1838 		x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
1839 		x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
1840 		x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));
1841 
1842 
1843 		// 32-point butterfly
1844 		x2 = reverse_epi16(x2);
1845 		x3 = reverse_epi16(x3);
1846 
1847 		y0 = _mm_add_epi16(x0, x3);
1848 		y1 = _mm_add_epi16(x1, x2);
1849 
1850 		y2 = _mm_sub_epi16(x0, x3);
1851 		y3 = _mm_sub_epi16(x1, x2);
1852 
1853 		// 16-point butterfly
1854 		y1 = reverse_epi16(y1);
1855 
1856 		x0 = _mm_add_epi16(y0, y1);
1857 		x1 = _mm_sub_epi16(y0, y1);
1858 
1859 
1860 		x2 = y2;
1861 		x3 = y3;
1862 
1863 
1864 
1865 		a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1866 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
1867 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
1868 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
1869 
1870 		//a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1871 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1872 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1873 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
1874 
1875 		a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
1876 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1877 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
1878 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
1879 
1880 		//a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1881 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1882 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1883 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
1884 
1885 		a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
1886 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
1887 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
1888 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
1889 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
1890 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
1891 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
1892 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
1893 
1894 		/**///        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1895 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1896 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1897 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1898 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1899 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1900 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
1901 		/**///        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
1902 
1903 		//a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1904 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1905 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1906 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1907 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1908 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1909 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
1910 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
1911 		//
1912 		//a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1913 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1914 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1915 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1916 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1917 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1918 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
1919 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
1920 
1921 		b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
1922 		//b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
1923 		b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
1924 		//b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
1925 		b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
1926 		/**///         b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
1927 		//b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
1928 		//b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
1929 
1930 		x0 = _mm_packs_epi32(b0, b1);
1931 		x1 = _mm_packs_epi32(b2, b3);
1932 		x2 = _mm_packs_epi32(b4, b4); // do not use b5
1933 
1934 		//x3 = _mm_packs_epi32(b6, b7);
1935 
1936 		y0 = _mm_unpacklo_epi16(x0, x1);
1937 		//y1 = _mm_unpackhi_epi16(x0, x1);
1938 		y2 = x2;
1939 		//y3 = x3;
1940 		x0 = _mm_unpacklo_epi16(y0, y2);
1941 		/**///        x1 = _mm_unpackhi_epi16(y0, y2);
1942 		//x2 = _mm_unpacklo_epi16(y1, y3);
1943 		//x3 = _mm_unpackhi_epi16(y1, y3);
1944 
1945 		_mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
1946 		/**///       _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
1947 		//_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
1948 		//_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
1949 	}
1950 }
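
// PfreqN4 second-pass forward 32-point transform: only the first 8 rows, 8 outputs each,
// producing the top-left 8x8 coefficient block.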
1951 static void PfreqN42DTransform32_SSE2(
1952 	EB_S16 *src,
1953 	EB_U32  src_stride,
1954 	EB_S16 *dst,
1955 	EB_U32  dst_stride,
1956 	EB_U32  shift)
1957 {
1958 	EB_U32 i;
1959 	__m128i s0 = _mm_cvtsi32_si128(shift);
1960 	__m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1961 	const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
1962 
1963 	for (i = 0; i < 8; i++)
1964 
1965 	{
1966 		__m128i x0, x1, x2, x3;
1967 		__m128i y0, y1, y2, y3;
1968 		__m128i a0, a2, a4/*, a5*/;
1969 		__m128i b0, b1, b2, b3, b4/*, b5*/;
1970 
1971 		b1 = s0;
1972 		b3 = s0;
1973 
1974 		x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
1975 		x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
1976 		x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
1977 		x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));
1978 
1979 
1980 		// 32-point butterfly
1981 		x2 = reverse_epi16(x2);
1982 		x3 = reverse_epi16(x3);
1983 
1984 		y0 = _mm_add_epi16(x0, x3);
1985 		y1 = _mm_add_epi16(x1, x2);
1986 
1987 		y2 = _mm_sub_epi16(x0, x3);
1988 		y3 = _mm_sub_epi16(x1, x2);
1989 
1990 		// 16-point butterfly
1991 		y1 = reverse_epi16(y1);
1992 
1993 		x0 = _mm_add_epi16(y0, y1);
1994 		x1 = _mm_sub_epi16(y0, y1);
1995 
1996 
1997 		x2 = y2;
1998 		x3 = y3;
1999 
2000 
2001 
2002 		a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
2003 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
2004 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
2005 		a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
2006 
2007 		//a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
2008 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
2009 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
2010 		//a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
2011 
2012 		a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
2013 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
2014 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
2015 		a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
2016 
2017 		//a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
2018 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
2019 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
2020 		//a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
2021 
2022 		a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
2023 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
2024 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
2025 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
2026 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
2027 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
2028 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
2029 		a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
2030 
2031 		/**///         a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
2032 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
2033 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
2034 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
2035 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
2036 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
2037 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
2038 		/**///         a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
2039 
2040 		//a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
2041 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
2042 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
2043 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
2044 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
2045 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
2046 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
2047 		//a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
2048 		//
2049 		//a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
2050 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
2051 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
2052 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
2053 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
2054 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
2055 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
2056 		//a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
2057 
2058 		b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
2059 		//b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
2060 		b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
2061 		//b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
2062 		b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
2063 		/**///  b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
2064 		//b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
2065 		//b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
2066 
2067 		x0 = _mm_packs_epi32(b0, b1);
2068 		x1 = _mm_packs_epi32(b2, b3);
2069 		x2 = _mm_packs_epi32(b4, b4);//do not use b5
2070 
2071 		//x3 = _mm_packs_epi32(b6, b7);
2072 
2073 		y0 = _mm_unpacklo_epi16(x0, x1);
2074 		//y1 = _mm_unpackhi_epi16(x0, x1);
2075 		y2 = x2;
2076 		//y3 = x3;
2077 		x0 = _mm_unpacklo_epi16(y0, y2);
2078 		/**///   x1 = _mm_unpackhi_epi16(y0, y2);
2079 		//x2 = _mm_unpacklo_epi16(y1, y3);
2080 		//x3 = _mm_unpackhi_epi16(y1, y3);
2081 
2082 		_mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
2083 		/**///    _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
2084 
2085 		//_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
2086 		//_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
2087 	}
2088 }
2089 
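// forward 32x32 transform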
2090 void Transform32x32_SSE2(
2091     EB_S16 *src,
2092     const EB_U32 src_stride,
2093     EB_S16 *dst,
2094     const EB_U32 dst_stride,
2095     EB_S16 *intermediate,
2096     EB_U32 addshift)
2097 {
2098     Transform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
2099     Transpose32_SSE2(intermediate, 32, dst, dst_stride);
2100     Transform32_SSE2(dst, dst_stride, intermediate, 32, 9);
2101     Transpose32_SSE2(intermediate, 32, dst, dst_stride);
2102 
2103     return;
2104 }
2105 
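// N/4 partial-frequency forward 32x32 transform: computes only the top-left 8x8
// coefficient block via the PfreqN4 passes and their dedicated transposes.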
2106 void PfreqN4Transform32x32_SSE2(
2107 	EB_S16 *src,
2108 	const EB_U32 src_stride,
2109 	EB_S16 *dst,
2110 	const EB_U32 dst_stride,
2111 	EB_S16 *intermediate,
2112 	EB_U32 addshift)
2113 {
2114 	PfreqN41DTransform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
2115 	PfreqN4FirstTranspose32_SSE2(intermediate, 32, dst, dst_stride);
2116 	PfreqN42DTransform32_SSE2(dst, dst_stride, intermediate, 32, 9);
2117 	PfreqN4SecTranspose32_SSE2(intermediate, 32, dst, dst_stride);
2118 
2119 	return;
2120 }
2121 
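// Pfreq first-pass forward 16-point transform: all 16 rows, only the first 8 outputs
// of each row are computed and stored.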
2122 static void Pfreq1DTransform16_SSE2(
2123     EB_S16 *src,
2124     EB_U32  src_stride,
2125     EB_S16 *dst,
2126     EB_U32  dst_stride,
2127     EB_U32  shift)
2128 {
2129     EB_U32 i;
2130     __m128i s0 = _mm_cvtsi32_si128(shift);
2131     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
2132     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
2133 
2134     for (i = 0; i < 16; i++)
2135     {
2136         __m128i x0, x1;
2137         __m128i y0, y1;
2138         __m128i a0, a2;
2139         __m128i b0, b1, b2, b3;
2140 
2141         b1 = s0;
2142         b3 = s0;
2143 
2144         y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
2145         y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
2146 
2147 
2148         // 16-point butterfly
2149         y1 = reverse_epi16(y1);
2150 
2151         x0 = _mm_add_epi16(y0, y1);
2152         x1 = _mm_sub_epi16(y0, y1);
2153 
2154         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
2155         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
2156         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
2157         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
2158 
2159         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
2160         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
2161         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
2162         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
2163 
2164         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
2165         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
2166 
2167         x0 = _mm_packs_epi32(b0, b1);
2168         x1 = _mm_packs_epi32(b2, b3);
2169 
2170         y0 = _mm_unpacklo_epi16(x0, x1);
2171 
2172         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
2173     }
2174 }
2175 
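// Pfreq second-pass forward 16-point transform: first 8 rows only, 8 outputs each.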
2176 static void Pfreq2DTransform16_SSE2(
2177     EB_S16 *src,
2178     EB_U32  src_stride,
2179     EB_S16 *dst,
2180     EB_U32  dst_stride,
2181     EB_U32  shift)
2182 {
2183     EB_U32 i;
2184     __m128i s0 = _mm_cvtsi32_si128(shift);
2185     __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
2186     const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
2187 
2188     for (i = 0; i < 8; i++)
2189     {
2190         __m128i x0, x1;
2191         __m128i y0, y1;
2192         __m128i a0, a2;
2193         __m128i b0, b1, b2, b3;
2194 
2195         b1 = s0;
2196         b3 = s0;
2197 
2198         y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
2199         y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
2200 
2201 
2202         // 16-point butterfly
2203         y1 = reverse_epi16(y1);
2204 
2205         x0 = _mm_add_epi16(y0, y1);
2206         x1 = _mm_sub_epi16(y0, y1);
2207 
2208         a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
2209         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
2210         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
2211         a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
2212 
2213 
2214         a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
2215         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
2216         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
2217         a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
2218 
2219 
2220         b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
2221         b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
2222 
2223         x0 = _mm_packs_epi32(b0, b1);
2224         x1 = _mm_packs_epi32(b2, b3);
2225 
2226         y0 = _mm_unpacklo_epi16(x0, x1);
2227 
2228         _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
2229     }
2230 }
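
// Transposes only the 16x8 region written by the Pfreq 16-point passes
// (two 8x8 tiles; j is limited to the first column group).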
2231 static void PfreqTranspose16_SSE2(
2232     EB_S16 *src,
2233     EB_U32  src_stride,
2234     EB_S16 *dst,
2235     EB_U32  dst_stride)
2236 {
2237     EB_U32 i, j;
2238     for (i = 0; i < 2; i++)
2239     {
2240         for (j = 0; j < 1; j++)
2241         {
2242             __m128i a0, a1, a2, a3, a4, a5, a6, a7;
2243             __m128i b0, b1, b2, b3, b4, b5, b6, b7;
2244 
2245             a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
2246             a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
2247             a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
2248             a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
2249             a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
2250             a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
2251             a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
2252             a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));
2253 
2254             b0 = _mm_unpacklo_epi16(a0, a4);
2255             b1 = _mm_unpacklo_epi16(a1, a5);
2256             b2 = _mm_unpacklo_epi16(a2, a6);
2257             b3 = _mm_unpacklo_epi16(a3, a7);
2258             b4 = _mm_unpackhi_epi16(a0, a4);
2259             b5 = _mm_unpackhi_epi16(a1, a5);
2260             b6 = _mm_unpackhi_epi16(a2, a6);
2261             b7 = _mm_unpackhi_epi16(a3, a7);
2262 
2263             a0 = _mm_unpacklo_epi16(b0, b2);
2264             a1 = _mm_unpacklo_epi16(b1, b3);
2265             a2 = _mm_unpackhi_epi16(b0, b2);
2266             a3 = _mm_unpackhi_epi16(b1, b3);
2267             a4 = _mm_unpacklo_epi16(b4, b6);
2268             a5 = _mm_unpacklo_epi16(b5, b7);
2269             a6 = _mm_unpackhi_epi16(b4, b6);
2270             a7 = _mm_unpackhi_epi16(b5, b7);
2271 
2272             b0 = _mm_unpacklo_epi16(a0, a1);
2273             b1 = _mm_unpackhi_epi16(a0, a1);
2274             b2 = _mm_unpacklo_epi16(a2, a3);
2275             b3 = _mm_unpackhi_epi16(a2, a3);
2276             b4 = _mm_unpacklo_epi16(a4, a5);
2277             b5 = _mm_unpackhi_epi16(a4, a5);
2278             b6 = _mm_unpacklo_epi16(a6, a7);
2279             b7 = _mm_unpackhi_epi16(a6, a7);
2280 
2281             _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
2282             _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
2283             _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
2284             _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
2285             _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
2286             _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
2287             _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
2288             _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
2289         }
2290     }
2291 }
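
/*
 * Partial-frequency 16x16 forward transform driver: row pass (shift =
 * 4 + addshift), transpose, column pass (shift = 9), transpose back. Since
 * both passes keep only the low-frequency half in their direction, the
 * meaningful output is confined to the low-frequency corner of 'dst'.
 */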
void PfreqTransform16x16_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform16_SSE2(src, src_stride, intermediate, 16, 4 + addshift);
    PfreqTranspose16_SSE2(intermediate, 16, dst, dst_stride);
    Pfreq2DTransform16_SSE2(dst, dst_stride, intermediate, 16, 9);
    PfreqTranspose16_SSE2(intermediate, 16, dst, dst_stride);

    return;
}

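/*
 * N4 (quarter-size) variant of the column pass above: the per-row math is
 * identical to Pfreq2DTransform16_SSE2, but only 4 output rows are computed,
 * further restricting the retained coefficients (hence the TODO about
 * narrowing the store to 64 bits).
 */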
static void PfreqN42DTransform16_SSE2(
    EB_S16 *src,
    EB_U32  src_stride,
    EB_S16 *dst,
    EB_U32  dst_stride,
    EB_U32  shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 4; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a2;
        __m128i b0, b1, b2, b3;

        b1 = s0;
        b3 = s0;

        y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);

        y0 = _mm_unpacklo_epi16(x0, x1);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0); //TODO change to 64bit
    }
}
static void PfreqN4FirstTranspose16_SSE2(
    EB_S16 *src,
    EB_U32  src_stride,
    EB_S16 *dst,
    EB_U32  dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 1; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3/*, b4, b5, b6, b7*/;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            // b4 = _mm_unpackhi_epi16(a0, a4);
            // b5 = _mm_unpackhi_epi16(a1, a5);
            // b6 = _mm_unpackhi_epi16(a2, a6);
            // b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            // a4 = _mm_unpacklo_epi16(b4, b6);
            // a5 = _mm_unpacklo_epi16(b5, b7);
            // a6 = _mm_unpackhi_epi16(b4, b6);
            // a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            // b4 = _mm_unpacklo_epi16(a4, a5);
            // b5 = _mm_unpackhi_epi16(a4, a5);
            // b6 = _mm_unpacklo_epi16(a6, a7);
            // b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}
static void PfreqN4SecondTranspose16_SSE2(
    EB_S16 *src,
    EB_U32  src_stride,
    EB_S16 *dst,
    EB_U32  dst_stride)
{
    EB_U32 i, j;

    i = j = 0;
    {
        {
            __m128i a0, a1, a2, a3/*, a4, a5, a6, a7*/;
            __m128i b0, b1, b2, b3/*, b4, b5, b6, b7*/;

            a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride)); //TODO load only 64bit
            a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride));
            a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride));
            a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride));

            b0 = _mm_unpacklo_epi16(a0, a0/*a4*/);
            b1 = _mm_unpacklo_epi16(a1, a1/*a5*/);
            b2 = _mm_unpacklo_epi16(a2, a2/*a6*/);
            b3 = _mm_unpacklo_epi16(a3, a3/*a7*/);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
        }
    }
}

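/*
 * Partial-frequency 32x32 forward transform driver, mirroring the 16x16
 * version above: row pass (shift = 6 + addshift), transpose, column pass
 * (shift = 9), transpose back, built on the 32-point helpers
 * (Pfreq1DTransform32_SSE2 / PfreqTranspose32_SSE2 / Pfreq2DTransform32_SSE2).
 */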
void PfreqTransform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
    PfreqTranspose32_SSE2(intermediate, 32, dst, dst_stride);
    Pfreq2DTransform32_SSE2(dst, dst_stride, intermediate, 32, 9);
    PfreqTranspose32_SSE2(intermediate, 32, dst, dst_stride);

    return;
}

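/*
 * N4 16x16 driver: same row pass as PfreqTransform16x16_SSE2, but the
 * transposes and the column pass are the quarter-size variants, so only the
 * top-left quarter of the coefficient block is produced.
 */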
void PfreqN4Transform16x16_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform16_SSE2(src, src_stride, intermediate, 16, 4 + addshift);
    PfreqN4FirstTranspose16_SSE2(intermediate, 16, dst, dst_stride);
    PfreqN42DTransform16_SSE2(dst, dst_stride, intermediate, 16, 9);
    PfreqN4SecondTranspose16_SSE2(intermediate, 16, dst, dst_stride);

    return;
}

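/*
 * 4x4 forward DCT (HEVC core transform, basis constants 64/83/36).
 * Stage 1 runs on the rows with shift = bitIncrement + 1; its factor-64 even
 * half is folded into the plain left shift by (5 - bitIncrement). Stage 2
 * runs on the columns with rounding offset 128 and shift 8; each
 * MACRO_TRANS_2MAC invocation emits one 4-coefficient output row.
 */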
void Transform4x4_SSE2_INTRIN(
    EB_S16                  *residual,
    const EB_U32             srcStride,
    EB_S16                  *transformCoefficients,
    const EB_U32             dstStride,
    EB_S16                  *transformInnerArrayPtr,
    EB_U32                   bitIncrement)
{
#define OFFSET_128 0
#define OFFSET_64_64 8
#define OFFSET_83_36 16
#define OFFSET_N36_N83 24
#define OFFSET_64_N64 32
#define OFFSET_N64_64 40
#define OFFSET_36_N83 48
#define OFFSET_83_N36 56

    EB_ALIGN(16) EB_S16 transformIntrinConst_SSE2[] = {
         128, 0,   128, 0,   128, 0,   128, 0,
         64,  64,  64,  64,  64,  64,  64,  64,
         83,  36,  83,  36,  83,  36,  83,  36,
        -36, -83, -36, -83, -36, -83, -36, -83,
         64, -64,  64, -64,  64, -64,  64, -64,
        -64,  64, -64,  64, -64,  64, -64,  64,
         36, -83,  36, -83,  36, -83,  36, -83,
         83, -36,  83, -36,  83, -36,  83, -36
    };
    EB_ALIGN(16) const EB_S16 * EbHevcTransformAsmConst = transformIntrinConst_SSE2;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm_offset, xmm_shift;

    xmm_shift = _mm_cvtsi32_si128(5 - bitIncrement);
    xmm0 = _mm_loadl_epi64((__m128i*)(residual));
    xmm1 = _mm_loadl_epi64((__m128i*)(residual + srcStride));
    xmm2 = _mm_loadl_epi64((__m128i *)(residual + 2 * srcStride));
    xmm3 = _mm_loadl_epi64((__m128i *)(residual + 3 * srcStride));
    xmm0 = _mm_unpacklo_epi16(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi16(xmm2, xmm3);

    xmm1 = _mm_unpackhi_epi32(xmm0, xmm2);
    xmm0 = _mm_unpacklo_epi32(xmm0, xmm2);
    xmm1 = _mm_unpacklo_epi64(_mm_srli_si128(xmm1, 8), xmm1);
    xmm3 = _mm_sub_epi16(xmm0, xmm1);
    xmm0 = _mm_add_epi16(xmm0, xmm1);

    xmm4 = xmm2 = xmm0;
    xmm0 = _mm_srli_si128(xmm0, 8);
    xmm2 = _mm_sll_epi16(_mm_add_epi16(xmm2, xmm0), xmm_shift);
    xmm4 = _mm_sll_epi16(_mm_sub_epi16(xmm4, xmm0), xmm_shift);

    xmm_offset = _mm_slli_epi16(_mm_set1_epi32(1), bitIncrement);
    xmm_shift = _mm_cvtsi32_si128(bitIncrement + 1);

    xmm1 = _mm_unpacklo_epi16(xmm3, _mm_srli_si128(xmm3, 8));

    xmm3 = _mm_madd_epi16(xmm1, _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_36_N83)));
    xmm1 = _mm_madd_epi16(xmm1, _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_83_36)));
    xmm1 = _mm_add_epi32(xmm1, xmm_offset);
    xmm3 = _mm_add_epi32(xmm3, xmm_offset);
    xmm1 = _mm_sra_epi32(xmm1, xmm_shift);
    xmm3 = _mm_sra_epi32(xmm3, xmm_shift);
    xmm1 = _mm_packs_epi32(xmm1, xmm3);

    xmm2 = _mm_unpacklo_epi32(xmm2, xmm1);
    xmm1 = _mm_srli_si128(xmm1, 8);
    xmm4 = _mm_unpacklo_epi32(xmm4, xmm1);

    xmm3 = _mm_unpackhi_epi64(xmm2, xmm4);
    xmm2 = _mm_unpacklo_epi64(xmm2, xmm4);

    xmm_offset = _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_128));

    MACRO_TRANS_2MAC(xmm2, xmm3, xmm0, xmm1, xmm_offset, OFFSET_64_64, OFFSET_64_64, 8, 0)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm4, xmm5, xmm_offset, OFFSET_83_36, OFFSET_N36_N83, 8, dstStride)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm6, xmm0, xmm_offset, OFFSET_64_N64, OFFSET_N64_64, 8, 2 * dstStride)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm1, xmm4, xmm_offset, OFFSET_36_N83, OFFSET_83_N36, 8, 3 * dstStride)

    (void)transformCoefficients;
    (void)transformInnerArrayPtr;

#undef OFFSET_128
#undef OFFSET_64_64
#undef OFFSET_83_36
#undef OFFSET_N36_N83
#undef OFFSET_64_N64
#undef OFFSET_N64_64
#undef OFFSET_36_N83
#undef OFFSET_83_N36
}

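/*
 * 4x4 forward DST (the alternative 4x4 transform for intra luma, basis
 * constants 29/55/74/84). Same two-stage layout as Transform4x4 above:
 * stage 1 uses shift = bitIncrement + 1 with a rounding offset built from the
 * OFFSET_DST_1 row of the constant table (expected to equal 1 << (shift - 1)),
 * stage 2 uses offset 128 and shift 8.
 */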
void DstTransform4x4_SSE2_INTRIN(
    EB_S16                  *residual,
    const EB_U32             srcStride,
    EB_S16                  *transformCoefficients,
    const EB_U32             dstStride,
    EB_S16                  *transformInnerArrayPtr,
    EB_U32                   bitIncrement)
{
#define OFFSET_DST_1        0
#define OFFSET_DST_29_55    (8+OFFSET_DST_1)
#define OFFSET_DST_74_84    (8+OFFSET_DST_29_55)
#define OFFSET_DST_84_N29   (8+OFFSET_DST_74_84)
#define OFFSET_DST_N74_55   (8+OFFSET_DST_84_N29)
#define OFFSET_DST_55_N84   (8+OFFSET_DST_N74_55)
#define OFFSET_DST_74_N29   (8+OFFSET_DST_55_N84)
#define OFFSET_DST_37_37    (8+OFFSET_DST_74_N29)
#define OFFSET_DST_74_74    (8+OFFSET_DST_37_37)
#define OFFSET_DST_0_N37    (8+OFFSET_DST_74_74)
#define OFFSET_DST_0_N74    (8+OFFSET_DST_0_N37)

    __m128i xmm_res0, xmm_res1, xmm_res2, xmm_res3, xmm_res0_1, xmm_res2_3, xmm_res_lo, xmm_res_hi, xmm_offset;
    __m128i xmm_trans0, xmm_trans1, xmm_trans2, xmm_trans3, xmm_trans0_1, xmm_trans2_3, xmm_trans_lo, xmm_trans_hi;
    __m128i xmm_temp;

    EB_U32 shift = bitIncrement + 1;
    EB_ALIGN(16) const EB_S16 * EbHevcTransformAsmConst = DstTransformAsmConst_SSE2;

    xmm_res0 = _mm_loadl_epi64((__m128i *)(residual));
    xmm_res1 = _mm_loadl_epi64((__m128i *)(residual + srcStride));
    xmm_res2 = _mm_loadl_epi64((__m128i *)(residual + 2 * srcStride));
    xmm_res3 = _mm_loadl_epi64((__m128i *)(residual + 3 * srcStride));
    xmm_offset = _mm_srli_epi32(_mm_slli_epi32(_mm_load_si128((__m128i *)(DstTransformAsmConst_SSE2 + OFFSET_DST_1)), shift), 1);

    xmm_res0_1 = _mm_unpacklo_epi32(xmm_res0, xmm_res1); // |res01    |res-S1-01|res23    |res-S1-23|
    xmm_res2_3 = _mm_unpacklo_epi32(xmm_res2, xmm_res3); // |res-S2-01|res-S3-01|res-S2-23|res-S3-23|
    xmm_res_hi = _mm_unpackhi_epi64(xmm_res0_1, xmm_res2_3); // |res23    |res-S1-23|res-S2-23|res-S3-23|
    xmm_res_lo = _mm_unpacklo_epi64(xmm_res0_1, xmm_res2_3); // |res01    |res-S1-01|res-S2-01|res-S3-01|

    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans0, xmm_temp, xmm_offset, OFFSET_DST_29_55, OFFSET_DST_74_84, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans1, xmm_temp, xmm_offset, OFFSET_DST_74_74, OFFSET_DST_0_N74, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans2, xmm_temp, xmm_offset, OFFSET_DST_84_N29, OFFSET_DST_N74_55, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans3, xmm_temp, xmm_offset, OFFSET_DST_55_N84, OFFSET_DST_74_N29, shift)

    // Second partial butterfly
    xmm_offset = _mm_set1_epi32(0x00000080); // 128
    xmm_trans0_1 = _mm_unpacklo_epi32(xmm_trans0, xmm_trans1);
    xmm_trans2_3 = _mm_unpacklo_epi32(xmm_trans2, xmm_trans3);
    xmm_trans_hi = _mm_unpackhi_epi64(xmm_trans0_1, xmm_trans2_3);
    xmm_trans_lo = _mm_unpacklo_epi64(xmm_trans0_1, xmm_trans2_3);

    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans0, xmm_temp, xmm_offset, OFFSET_DST_29_55, OFFSET_DST_74_84, 8, 0)
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans1, xmm_temp, xmm_offset, OFFSET_DST_74_74, OFFSET_DST_0_N74, 8, dstStride)
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans2, xmm_temp, xmm_offset, OFFSET_DST_84_N29, OFFSET_DST_N74_55, 8, (2 * dstStride))
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans3, xmm_temp, xmm_offset, OFFSET_DST_55_N84, OFFSET_DST_74_N29, 8, (3 * dstStride))

    (void)transformInnerArrayPtr;
}

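/*
 * 8x8 forward DCT. The even half reuses the 4-point butterfly (64/83/36) and
 * the odd half uses the 8-point odd basis (89/75/50/18). Stage 1 uses
 * shift = bitIncrement + 2 with offset 2 << bitIncrement; output rows 0 and 4
 * reduce to plain left shifts by (4 - bitIncrement). Stage 2 uses offset 256
 * and shift 9, one MACRO_TRANS_8MAC per coefficient row.
 */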
void Transform8x8_SSE2_INTRIN(
    EB_S16                  *residual,
    const EB_U32             srcStride,
    EB_S16                  *transformCoefficients,
    const EB_U32             dstStride,
    EB_S16                  *transformInnerArrayPtr,
    EB_U32                   bitIncrement)
{
    // Transform8x8 has its own table because the larger table's offset macros exceed 256
    // (the maximum macro expansion depth). Use a smaller table with values just for Transform8x8.

    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83,  36,  83,  36,  83,  36,  83,  36,
        36, -83,  36, -83,  36, -83,  36, -83,
        89,  75,  89,  75,  89,  75,  89,  75,
        50,  18,  50,  18,  50,  18,  50,  18,
        75, -18,  75, -18,  75, -18,  75, -18,
       -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89,  50, -89,  50, -89,  50, -89,
        18,  75,  18,  75,  18,  75,  18,  75,
        18, -50,  18, -50,  18, -50,  18, -50,
        75, -89,  75, -89,  75, -89,  75, -89,
        256, 0,   256, 0,   256, 0,   256, 0,
        64,  64,  64,  64,  64,  64,  64,  64,
       -18, -50, -18, -50, -18, -50, -18, -50,
       -75, -89, -75, -89, -75, -89, -75, -89,
       -36, -83, -36, -83, -36, -83, -36, -83,
       -83, -36, -83, -36, -83, -36, -83, -36,
        36,  83,  36,  83,  36,  83,  36,  83,
        50,  89,  50,  89,  50,  89,  50,  89,
        18, -75,  18, -75,  18, -75,  18, -75,
       -64,  64, -64,  64, -64,  64, -64,  64,
        64, -64,  64, -64,  64, -64,  64, -64,
       -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50,  89, -50,  89, -50,  89, -50,
        83, -36,  83, -36,  83, -36,  83, -36,
       -36,  83, -36,  83, -36,  83, -36,  83,
       -83,  36, -83,  36, -83,  36, -83,  36,
        89, -75,  89, -75,  89, -75,  89, -75,
        50, -18,  50, -18,  50, -18,  50, -18,
    };
    __m128i sum, sum1, sum2, sum3, sum4;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4, trans5, trans6, trans7, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123, trans46, trans4567;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // TransformCoefficients 1, 3, 5, 7
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75,  TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4, trans5, trans6, trans7, trans01, trans23, trans45, trans67)
    MACRO_UNPACK(64, trans0, trans2, trans01, trans23, trans4, trans6, trans45, trans67, trans02, trans0123, trans46, trans4567)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    transformCoefficients += 4 * dstStride;
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}

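/*
 * Partial-frequency (N2) 8x8 forward DCT: stage 1 matches
 * Transform8x8_SSE2_INTRIN but skips first-stage outputs 5-7 (kept here as
 * commented-out code), and stage 2 stores only the first four coefficient
 * rows via MACRO_TRANS_8MAC_PF_N2, so the bottom half of the coefficient
 * block is never written.
 */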
void PfreqTransform8x8_SSE2_INTRIN(
    EB_S16                  *residual,
    const EB_U32             srcStride,
    EB_S16                  *transformCoefficients,
    const EB_U32             dstStride,
    EB_S16                  *transformInnerArrayPtr,
    EB_U32                   bitIncrement)
{
    // Transform8x8 has its own table because the larger table's offset macros exceed 256
    // (the maximum macro expansion depth). Use a smaller table with values just for Transform8x8.

    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83, 36, 83, 36, 83, 36, 83, 36,
        36, -83, 36, -83, 36, -83, 36, -83,
        89, 75, 89, 75, 89, 75, 89, 75,
        50, 18, 50, 18, 50, 18, 50, 18,
        75, -18, 75, -18, 75, -18, 75, -18,
        -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89, 50, -89, 50, -89, 50, -89,
        18, 75, 18, 75, 18, 75, 18, 75,
        18, -50, 18, -50, 18, -50, 18, -50,
        75, -89, 75, -89, 75, -89, 75, -89,
        256, 0, 256, 0, 256, 0, 256, 0,
        64, 64, 64, 64, 64, 64, 64, 64,
        -18, -50, -18, -50, -18, -50, -18, -50,
        -75, -89, -75, -89, -75, -89, -75, -89,
        -36, -83, -36, -83, -36, -83, -36, -83,
        -83, -36, -83, -36, -83, -36, -83, -36,
        36, 83, 36, 83, 36, 83, 36, 83,
        50, 89, 50, 89, 50, 89, 50, 89,
        18, -75, 18, -75, 18, -75, 18, -75,
        -64, 64, -64, 64, -64, 64, -64, 64,
        64, -64, 64, -64, 64, -64, 64, -64,
        -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50, 89, -50, 89, -50, 89, -50,
        83, -36, 83, -36, 83, -36, 83, -36,
        -36, 83, -36, 83, -36, 83, -36, 83,
        -83, 36, -83, 36, -83, 36, -83, 36,
        89, -75, 89, -75, 89, -75, 89, -75,
        50, -18, 50, -18, 50, -18, 50, -18,
    };
    __m128i sum, sum1, sum2/*, sum3, sum4*/;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    //trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
    //    _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // TransformCoefficients 1, 3, 5, 7
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans1, trans1, trans1, trans01, trans23, trans45, trans67)
    MACRO_UNPACK_V2(64, trans0, trans2, trans01, trans23, trans4, trans0, /*trans6,*/ trans45, trans67, trans02, trans0123)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    //transformCoefficients += 4 * dstStride;
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}

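/*
 * Quarter-size (N4) 8x8 forward DCT: identical first stage to the N2 version
 * above, but stage 2 emits only the first two coefficient rows via
 * MACRO_TRANS_8MAC_PF_N4, restricting the result to the lowest-frequency rows.
 */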
void PfreqN4Transform8x8_SSE2_INTRIN(
    EB_S16                  *residual,
    const EB_U32             srcStride,
    EB_S16                  *transformCoefficients,
    const EB_U32             dstStride,
    EB_S16                  *transformInnerArrayPtr,
    EB_U32                   bitIncrement)
{
    // Transform8x8 has its own table because the larger table's offset macros exceed 256
    // (the maximum macro expansion depth). Use a smaller table with values just for Transform8x8.

    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83, 36, 83, 36, 83, 36, 83, 36,
        36, -83, 36, -83, 36, -83, 36, -83,
        89, 75, 89, 75, 89, 75, 89, 75,
        50, 18, 50, 18, 50, 18, 50, 18,
        75, -18, 75, -18, 75, -18, 75, -18,
        -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89, 50, -89, 50, -89, 50, -89,
        18, 75, 18, 75, 18, 75, 18, 75,
        18, -50, 18, -50, 18, -50, 18, -50,
        75, -89, 75, -89, 75, -89, 75, -89,
        256, 0, 256, 0, 256, 0, 256, 0,
        64, 64, 64, 64, 64, 64, 64, 64,
        -18, -50, -18, -50, -18, -50, -18, -50,
        -75, -89, -75, -89, -75, -89, -75, -89,
        -36, -83, -36, -83, -36, -83, -36, -83,
        -83, -36, -83, -36, -83, -36, -83, -36,
        36, 83, 36, 83, 36, 83, 36, 83,
        50, 89, 50, 89, 50, 89, 50, 89,
        18, -75, 18, -75, 18, -75, 18, -75,
        -64, 64, -64, 64, -64, 64, -64, 64,
        64, -64, 64, -64, 64, -64, 64, -64,
        -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50, 89, -50, 89, -50, 89, -50,
        83, -36, 83, -36, 83, -36, 83, -36,
        -36, 83, -36, 83, -36, 83, -36, 83,
        -83, 36, -83, 36, -83, 36, -83, 36,
        89, -75, 89, -75, 89, -75, 89, -75,
        50, -18, 50, -18, 50, -18, 50, -18,
    };
    __m128i sum, sum1, sum2/*, sum3, sum4*/;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    //trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
    //    _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // TransformCoefficients 1, 3, 5, 7
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans1, trans1, trans1, trans01, trans23, trans45, trans67)
    MACRO_UNPACK_V2(64, trans0, trans2, trans01, trans23, trans4, trans0, /*trans6,*/ trans45, trans67, trans02, trans0123)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    MACRO_TRANS_8MAC_PF_N4(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC_PF_N4(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    //transformCoefficients += 4 * dstStride;
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}