/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/


#include "EbTransforms_SSE2.h"
#include "EbIntrinMacros16bit_SSE2.h"
#include <emmintrin.h>

/*****************************
* Defines
*****************************/

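// MACRO_TRANS_2MAC_NO_SAVE: one multiply-accumulate stage. Two 8x16-bit
// coefficient rows are loaded from EbHevcTransformAsmConst, dot products are
// formed against XMM_1/XMM_2 with _mm_madd_epi16, the rounding offset is added,
// the sums are arithmetic-shifted right by SHIFT and packed back to 16 bits.
// MACRO_TRANS_2MAC additionally stores the low four 16-bit results to
// transformCoefficients + OFFSET3.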
#define MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
XMM_3 = _mm_load_si128((__m128i *)(EbHevcTransformAsmConst + OFFSET1));\
XMM_4 = _mm_load_si128((__m128i *)(EbHevcTransformAsmConst + OFFSET2));\
XMM_3 = _mm_madd_epi16(XMM_3, XMM_1);\
XMM_4 = _mm_madd_epi16(XMM_4, XMM_2);\
XMM_3 = _mm_srai_epi32(_mm_add_epi32(XMM_4, _mm_add_epi32(XMM_3, XMM_OFFSET)), SHIFT);\
XMM_3 = _mm_packs_epi32(XMM_3, XMM_3);

#define MACRO_TRANS_2MAC(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT, OFFSET3)\
MACRO_TRANS_2MAC_NO_SAVE(XMM_1, XMM_2, XMM_3, XMM_4, XMM_OFFSET, OFFSET1, OFFSET2, SHIFT)\
_mm_storel_epi64((__m128i *)(transformCoefficients+OFFSET3), XMM_3);

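// Element offsets into EbHevcTransformAsmConst: each entry names a (c0, c1)
// coefficient pair replicated across one 128-bit row, so consecutive entries
// advance by 8 EB_S16 values (an N prefix marks a negated constant).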
#define TRANS8x8_OFFSET_83_36 0
#define TRANS8x8_OFFSET_36_N83 (8 + TRANS8x8_OFFSET_83_36)
#define TRANS8x8_OFFSET_89_75 (8 + TRANS8x8_OFFSET_36_N83)
#define TRANS8x8_OFFSET_50_18 (8 + TRANS8x8_OFFSET_89_75)
#define TRANS8x8_OFFSET_75_N18 (8 + TRANS8x8_OFFSET_50_18)
#define TRANS8x8_OFFSET_N89_N50 (8 + TRANS8x8_OFFSET_75_N18)
#define TRANS8x8_OFFSET_50_N89 (8 + TRANS8x8_OFFSET_N89_N50)
#define TRANS8x8_OFFSET_18_75 (8 + TRANS8x8_OFFSET_50_N89)
#define TRANS8x8_OFFSET_18_N50 (8 + TRANS8x8_OFFSET_18_75)
#define TRANS8x8_OFFSET_75_N89 (8 + TRANS8x8_OFFSET_18_N50)
#define TRANS8x8_OFFSET_256 (8 + TRANS8x8_OFFSET_75_N89)
#define TRANS8x8_OFFSET_64_64 (8 + TRANS8x8_OFFSET_256)
#define TRANS8x8_OFFSET_N18_N50 (8 + TRANS8x8_OFFSET_64_64)
#define TRANS8x8_OFFSET_N75_N89 (8 + TRANS8x8_OFFSET_N18_N50)
#define TRANS8x8_OFFSET_N36_N83 (8 + TRANS8x8_OFFSET_N75_N89)
#define TRANS8x8_OFFSET_N83_N36 (8 + TRANS8x8_OFFSET_N36_N83)
#define TRANS8x8_OFFSET_36_83 (8 + TRANS8x8_OFFSET_N83_N36)
#define TRANS8x8_OFFSET_50_89 (8 + TRANS8x8_OFFSET_36_83)
#define TRANS8x8_OFFSET_18_N75 (8 + TRANS8x8_OFFSET_50_89)
#define TRANS8x8_OFFSET_N64_64 (8 + TRANS8x8_OFFSET_18_N75)
#define TRANS8x8_OFFSET_64_N64 (8 + TRANS8x8_OFFSET_N64_64)
#define TRANS8x8_OFFSET_N75_N18 (8 + TRANS8x8_OFFSET_64_N64)
#define TRANS8x8_OFFSET_89_N50 (8 + TRANS8x8_OFFSET_N75_N18)
#define TRANS8x8_OFFSET_83_N36 (8 + TRANS8x8_OFFSET_89_N50)
#define TRANS8x8_OFFSET_N36_83 (8 + TRANS8x8_OFFSET_83_N36)
#define TRANS8x8_OFFSET_N83_36 (8 + TRANS8x8_OFFSET_N36_83)
#define TRANS8x8_OFFSET_89_N75 (8 + TRANS8x8_OFFSET_N83_36)
#define TRANS8x8_OFFSET_50_N18 (8 + TRANS8x8_OFFSET_89_N75)

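// Forward 8-point butterfly: with XMM1..XMM8 holding inputs 0..7,
// even[i] = in[i] + in[7 - i] and odd[i] = in[i] - in[7 - i].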
#define MACRO_CALC_EVEN_ODD(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8)\
even0 = _mm_add_epi16(XMM1, XMM8);\
even1 = _mm_add_epi16(XMM2, XMM7);\
even2 = _mm_add_epi16(XMM3, XMM6);\
even3 = _mm_add_epi16(XMM4, XMM5);\
odd0 = _mm_sub_epi16(XMM1, XMM8);\
odd1 = _mm_sub_epi16(XMM2, XMM7);\
odd2 = _mm_sub_epi16(XMM3, XMM6);\
odd3 = _mm_sub_epi16(XMM4, XMM5);

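// MACRO_TRANS_4MAC_NO_SAVE: two parallel dot-product/round/shift pipelines
// (XMM1/XMM3 and XMM2/XMM4) against the same coefficient rows, packed into one
// 8x16-bit result.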
#define MACRO_TRANS_4MAC_NO_SAVE(XMM1, XMM2, XMM3, XMM4, XMM_RET, XMM_OFFSET, MEM, OFFSET1, OFFSET2, SHIFT)\
XMM_RET = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM1, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
_mm_madd_epi16(XMM3, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT),\
_mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(_mm_madd_epi16(XMM2, _mm_load_si128((__m128i*)(MEM+OFFSET1))),\
_mm_madd_epi16(XMM4, _mm_load_si128((__m128i*)(MEM+OFFSET2)))), XMM_OFFSET), SHIFT));

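// MACRO_TRANS_8MAC: two rounded dot-product sums packed into eight 16-bit
// outputs and stored with INSTR. The _PF_N2/_PF_N4 variants serve the
// partial-frequency paths: the upper half of the computation is left commented
// out because those outputs are discarded, and sum1 is packed with itself so
// only the low 64 bits of the store are meaningful.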
#define MACRO_TRANS_8MAC(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);\
sum = _mm_packs_epi32(sum1, sum3);\
INSTR((__m128i *)(DST + OFST5), sum);

#define MACRO_TRANS_8MAC_PF_N2(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
/*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
/*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
/*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
/*sum = _mm_packs_epi32(sum1, sum3);*/\
sum = _mm_packs_epi32(sum1, sum1);\
INSTR((__m128i *)(DST + OFST5), sum);

#define MACRO_TRANS_8MAC_PF_N4(XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM_OFST, MEM, OFST1, OFST2, OFST3, OFST4, SHIFT, INSTR, DST, OFST5)\
sum1 = _mm_add_epi32(_mm_madd_epi16(XMM1, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM2, _mm_loadu_si128((__m128i *)(MEM + OFST2))));\
sum2 = _mm_add_epi32(_mm_madd_epi16(XMM3, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM4, _mm_loadu_si128((__m128i *)(MEM + OFST4))));\
sum1 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum1, sum2)), SHIFT);\
/*sum3 = _mm_add_epi32(_mm_madd_epi16(XMM5, _mm_loadu_si128((__m128i *)(MEM + OFST1))), _mm_madd_epi16(XMM6, _mm_loadu_si128((__m128i *)(MEM + OFST2))));*/\
/*sum4 = _mm_add_epi32(_mm_madd_epi16(XMM7, _mm_loadu_si128((__m128i *)(MEM + OFST3))), _mm_madd_epi16(XMM8, _mm_loadu_si128((__m128i *)(MEM + OFST4))));*/\
/*sum3 = _mm_srai_epi32(_mm_add_epi32(XMM_OFST, _mm_add_epi32(sum3, sum4)), SHIFT);*/\
/*sum = _mm_packs_epi32(sum1, sum3);*/\
sum = _mm_packs_epi32(sum1, sum1);\
INSTR((__m128i *)(DST + OFST5), sum);

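// 4-point DST coefficient pairs (29/55/74/84, the HEVC 4x4 intra luma DST),
// replicated across 128-bit rows for use with _mm_madd_epi16.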
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
EB_ALIGN(16) const EB_S16 DstTransformAsmConst_SSE2[] = {
    1, 0, 1, 0, 1, 0, 1, 0,
    29, 55, 29, 55, 29, 55, 29, 55,
    74, 84, 74, 84, 74, 84, 74, 84,
    84, -29, 84, -29, 84, -29, 84, -29,
    -74, 55, -74, 55, -74, 55, -74, 55,
    55, -84, 55, -84, 55, -84, 55, -84,
    74, -29, 74, -29, 74, -29, 74, -29,
    37, 37, 37, 37, 37, 37, 37, 37,
    74, 74, 74, 74, 74, 74, 74, 74,
    0, -37, 0, -37, 0, -37, 0, -37,
    0, -74, 0, -74, 0, -74, 0, -74,
    //74, 0, 74, 0, 74, 0, 74, 0,
    //55, -29, 55, -29, 55, -29, 55, -29,
};
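
// Inverse-transform constants: small rounding/shift values first, followed by
// replicated DCT coefficient pairs consumed by the inverse butterflies.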
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
EB_ALIGN(16) const EB_S16 InvTransformAsmConst_SSE2[] = {
    2, 0, 2, 0, 2, 0, 2, 0,
    4, 0, 4, 0, 4, 0, 4, 0,
    8, 0, 8, 0, 8, 0, 8, 0,
    9, 0, 9, 0, 9, 0, 9, 0,
    64, 0, 64, 0, 64, 0, 64, 0,
    256, 0, 256, 0, 256, 0, 256, 0,
    512, 0, 512, 0, 512, 0, 512, 0,
    1024, 0, 1024, 0, 1024, 0, 1024, 0,
    2048, 0, 2048, 0, 2048, 0, 2048, 0,
    7, 0, 0, 0, 0, 0, 0, 0,
    12, 0, 0, 0, 0, 0, 0, 0,
    64, 64, 64, 64, 64, 64, 64, 64,
    90, 57, 90, 57, 90, 57, 90, 57,
    89, 50, 89, 50, 89, 50, 89, 50,
    87, 43, 87, 43, 87, 43, 87, 43,
    83, 36, 83, 36, 83, 36, 83, 36,
    80, 25, 80, 25, 80, 25, 80, 25,
    75, 18, 75, 18, 75, 18, 75, 18,
    70, 9, 70, 9, 70, 9, 70, 9,
    64, -64, 64, -64, 64, -64, 64, -64,
    87, -80, 87, -80, 87, -80, 87, -80,
    75, -89, 75, -89, 75, -89, 75, -89,
    57, -90, 57, -90, 57, -90, 57, -90,
    36, -83, 36, -83, 36, -83, 36, -83,
    9, -70, 9, -70, 9, -70, 9, -70,
    -18, -50, -18, -50, -18, -50, -18, -50,
    -43, -25, -43, -25, -43, -25, -43, -25,
    80, -25, 80, -25, 80, -25, 80, -25,
    50, 18, 50, 18, 50, 18, 50, 18,
    9, 57, 9, 57, 9, 57, 9, 57,
    -36, 83, -36, 83, -36, 83, -36, 83,
    -70, 90, -70, 90, -70, 90, -70, 90,
    -89, 75, -89, 75, -89, 75, -89, 75,
    -87, 43, -87, 43, -87, 43, -87, 43,
    70, 90, 70, 90, 70, 90, 70, 90,
    18, 75, 18, 75, 18, 75, 18, 75,
    -43, 25, -43, 25, -43, 25, -43, 25,
    -83, -36, -83, -36, -83, -36, -83, -36,
    -87, -80, -87, -80, -87, -80, -87, -80,
    -50, -89, -50, -89, -50, -89, -50, -89,
    9, -57, 9, -57, 9, -57, 9, -57,
    57, -9, 57, -9, 57, -9, 57, -9,
    -18, -75, -18, -75, -18, -75, -18, -75,
    -80, -87, -80, -87, -80, -87, -80, -87,
    -25, 43, -25, 43, -25, 43, -25, 43,
    50, 89, 50, 89, 50, 89, 50, 89,
    90, 70, 90, 70, 90, 70, 90, 70,
    43, -87, 43, -87, 43, -87, 43, -87,
    -50, -18, -50, -18, -50, -18, -50, -18,
    -90, 70, -90, 70, -90, 70, -90, 70,
    57, 9, 57, 9, 57, 9, 57, 9,
    89, -75, 89, -75, 89, -75, 89, -75,
    25, -80, 25, -80, 25, -80, 25, -80,
    25, 43, 25, 43, 25, 43, 25, 43,
    -75, 89, -75, 89, -75, 89, -75, 89,
    -70, 9, -70, 9, -70, 9, -70, 9,
    90, -57, 90, -57, 90, -57, 90, -57,
    18, 50, 18, 50, 18, 50, 18, 50,
    -80, 87, -80, 87, -80, 87, -80, 87,
    9, 70, 9, 70, 9, 70, 9, 70,
    -89, -50, -89, -50, -89, -50, -89, -50,
    -25, -80, -25, -80, -25, -80, -25, -80,
    43, 87, 43, 87, 43, 87, 43, 87,
    -75, -18, -75, -18, -75, -18, -75, -18,
    -57, -90, -57, -90, -57, -90, -57, -90,
    -9, -70, -9, -70, -9, -70, -9, -70,
    25, 80, 25, 80, 25, 80, 25, 80,
    -43, -87, -43, -87, -43, -87, -43, -87,
    57, 90, 57, 90, 57, 90, 57, 90,
    -25, -43, -25, -43, -25, -43, -25, -43,
    70, -9, 70, -9, 70, -9, 70, -9,
    -90, 57, -90, 57, -90, 57, -90, 57,
    80, -87, 80, -87, 80, -87, 80, -87,
    -43, 87, -43, 87, -43, 87, -43, 87,
    90, -70, 90, -70, 90, -70, 90, -70,
    -57, -9, -57, -9, -57, -9, -57, -9,
    -25, 80, -25, 80, -25, 80, -25, 80,
    -57, 9, -57, 9, -57, 9, -57, 9,
    80, 87, 80, 87, 80, 87, 80, 87,
    25, -43, 25, -43, 25, -43, 25, -43,
    -90, -70, -90, -70, -90, -70, -90, -70,
    -70, -90, -70, -90, -70, -90, -70, -90,
    43, -25, 43, -25, 43, -25, 43, -25,
    87, 80, 87, 80, 87, 80, 87, 80,
    -9, 57, -9, 57, -9, 57, -9, 57,
    -80, 25, -80, 25, -80, 25, -80, 25,
    -9, -57, -9, -57, -9, -57, -9, -57,
    70, -90, 70, -90, 70, -90, 70, -90,
    87, -43, 87, -43, 87, -43, 87, -43,
    -87, 80, -87, 80, -87, 80, -87, 80,
    -57, 90, -57, 90, -57, 90, -57, 90,
    -9, 70, -9, 70, -9, 70, -9, 70,
    43, 25, 43, 25, 43, 25, 43, 25,
    -90, -57, -90, -57, -90, -57, -90, -57,
    -87, -43, -87, -43, -87, -43, -87, -43,
    -80, -25, -80, -25, -80, -25, -80, -25,
    -70, -9, -70, -9, -70, -9, -70, -9,
    90, 61, 90, 61, 90, 61, 90, 61,
    90, 54, 90, 54, 90, 54, 90, 54,
    88, 46, 88, 46, 88, 46, 88, 46,
    85, 38, 85, 38, 85, 38, 85, 38,
    82, 31, 82, 31, 82, 31, 82, 31,
    78, 22, 78, 22, 78, 22, 78, 22,
    73, 13, 73, 13, 73, 13, 73, 13,
    67, 4, 67, 4, 67, 4, 67, 4,
    90, -73, 90, -73, 90, -73, 90, -73,
    82, -85, 82, -85, 82, -85, 82, -85,
    67, -90, 67, -90, 67, -90, 67, -90,
    46, -88, 46, -88, 46, -88, 46, -88,
    22, -78, 22, -78, 22, -78, 22, -78,
    -4, -61, -4, -61, -4, -61, -4, -61,
    -31, -38, -31, -38, -31, -38, -31, -38,
    -54, -13, -54, -13, -54, -13, -54, -13,
    88, -46, 88, -46, 88, -46, 88, -46,
    67, -4, 67, -4, 67, -4, 67, -4,
    31, 38, 31, 38, 31, 38, 31, 38,
    -13, 73, -13, 73, -13, 73, -13, 73,
    -54, 90, -54, 90, -54, 90, -54, 90,
    -82, 85, -82, 85, -82, 85, -82, 85,
    -90, 61, -90, 61, -90, 61, -90, 61,
    -78, 22, -78, 22, -78, 22, -78, 22,
    85, 82, 85, 82, 85, 82, 85, 82,
    46, 88, 46, 88, 46, 88, 46, 88,
    -13, 54, -13, 54, -13, 54, -13, 54,
    -67, -4, -67, -4, -67, -4, -67, -4,
    -90, -61, -90, -61, -90, -61, -90, -61,
    -73, -90, -73, -90, -73, -90, -73, -90,
    -22, -78, -22, -78, -22, -78, -22, -78,
    38, -31, 38, -31, 38, -31, 38, -31,
    22, -46, 22, -46, 22, -46, 22, -46,
    -54, -90, -54, -90, -54, -90, -54, -90,
    -90, -67, -90, -67, -90, -67, -90, -67,
    -61, 4, -61, 4, -61, 4, -61, 4,
    13, 73, 13, 73, 13, 73, 13, 73,
    78, 88, 78, 88, 78, 88, 78, 88,
    78, -88, 78, -88, 78, -88, 78, -88,
    -82, 31, -82, 31, -82, 31, -82, 31,
    -73, 90, -73, 90, -73, 90, -73, 90,
    13, 54, 13, 54, 13, 54, 13, 54,
    85, -38, 85, -38, 85, -38, 85, -38,
    -22, -46, -22, -46, -22, -46, -22, -46,
    73, -13, 73, -13, 73, -13, 73, -13,
    -31, 82, -31, 82, -31, 82, -31, 82,
    -38, 85, -38, 85, -38, 85, -38, 85,
    -90, 54, -90, 54, -90, 54, -90, 54,
    67, 90, 67, 90, 67, 90, 67, 90,
    -54, 13, -54, 13, -54, 13, -54, 13,
    -78, -88, -78, -88, -78, -88, -78, -88,
    -22, 46, -22, 46, -22, 46, -22, 46,
    -90, -73, -90, -73, -90, -73, -90, -73,
    4, -61, 4, -61, 4, -61, 4, -61,
    61, -4, 61, -4, 61, -4, 61, -4,
    -46, 22, -46, 22, -46, 22, -46, 22,
    82, 85, 82, 85, 82, 85, 82, 85,
    31, -38, 31, -38, 31, -38, 31, -38,
    -88, -78, -88, -78, -88, -78, -88, -78,
    90, 67, 90, 67, 90, 67, 90, 67,
    54, -90, 54, -90, 54, -90, 54, -90,
    -85, 38, -85, 38, -85, 38, -85, 38,
    -4, 67, -4, 67, -4, 67, -4, 67,
    88, -78, 88, -78, 88, -78, 88, -78,
    -46, -22, -46, -22, -46, -22, -46, -22,
    -61, 90, -61, 90, -61, 90, -61, 90,
    82, -31, 82, -31, 82, -31, 82, -31,
    13, -73, 13, -73, 13, -73, 13, -73,
    46, 22, 46, 22, 46, 22, 46, 22,
    -90, 67, -90, 67, -90, 67, -90, 67,
    38, -85, 38, -85, 38, -85, 38, -85,
    54, 13, 54, 13, 54, 13, 54, 13,
    -90, 73, -90, 73, -90, 73, -90, 73,
    31, -82, 31, -82, 31, -82, 31, -82,
    61, 4, 61, 4, 61, 4, 61, 4,
    -88, 78, -88, 78, -88, 78, -88, 78,
    38, 85, 38, 85, 38, 85, 38, 85,
    -4, 61, -4, 61, -4, 61, -4, 61,
    -67, -90, -67, -90, -67, -90, -67, -90,
    -31, -82, -31, -82, -31, -82, -31, -82,
    -78, -22, -78, -22, -78, -22, -78, -22,
    90, 73, 90, 73, 90, 73, 90, 73,
    -61, -90, -61, -90, -61, -90, -61, -90,
    4, 67, 4, 67, 4, 67, 4, 67,
    54, -13, 54, -13, 54, -13, 54, -13,
    -88, -46, -88, -46, -88, -46, -88, -46,
    85, -82, 85, -82, 85, -82, 85, -82,
    -38, -31, -38, -31, -38, -31, -38, -31,
    -13, -73, -13, -73, -13, -73, -13, -73,
    22, 78, 22, 78, 22, 78, 22, 78,
    -46, -88, -46, -88, -46, -88, -46, -88,
    54, 90, 54, 90, 54, 90, 54, 90
};

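// Inverse 4-point DST coefficient pairs.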
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
EB_ALIGN(16) const EB_S16 InvDstTransformAsmConst_SSE2[] = {
    64, 0, 64, 0, 64, 0, 64, 0,
    29, 84, 29, 84, 29, 84, 29, 84,
    74, 55, 74, 55, 74, 55, 74, 55,
    55, -29, 55, -29, 55, -29, 55, -29,
    74, -84, 74, -84, 74, -84, 74, -84,
    74, -74, 74, -74, 74, -74, 74, -74,
    0, 74, 0, 74, 0, 74, 0, 74,
    84, 55, 84, 55, 84, 55, 84, 55,
    -74, -29, -74, -29, -74, -29, -74, -29,
};

// Coefficients for inverse 32-point transform
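// Layout as __m128i rows: [0..7] even 8-point part, [8..15] odd 8-point part,
// [16..47] odd 16-point part of the 32-point inverse butterfly.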
EB_EXTERN const EB_S16 EbHevcCoeff_tbl2[48 * 8] =
{
    64, 89, 64, 75, 64, 50, 64, 18, 64, -18, 64, -50, 64, -75, 64, -89,
    83, 75, 36, -18, -36, -89, -83, -50, -83, 50, -36, 89, 36, 18, 83, -75,
    64, 50, -64, -89, -64, 18, 64, 75, 64, -75, -64, -18, -64, 89, 64, -50,
    36, 18, -83, -50, 83, 75, -36, -89, -36, 89, 83, -75, -83, 50, 36, -18,
    90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
    80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
    57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
    25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
    90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
    61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
    88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
    -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
    82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
    31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
    73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
    -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
    61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
    -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
    46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
    22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
    31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
    -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
    13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
    54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
};

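// Coefficients for the forward transform (consumed by Transform16 below).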
#ifdef __GNUC__
#ifndef __cplusplus
__attribute__((visibility("hidden")))
#endif
#endif
EB_EXTERN const EB_S16 EbHevcCoeff_tbl[48 * 8] =
{
    64, 64, 89, 75, 83, 36, 75, -18, 64, -64, 50, -89, 36, -83, 18, -50,
    64, 64, 50, 18, -36, -83, -89, -50, -64, 64, 18, 75, 83, -36, 75, -89,
    64, 64, -18, -50, -83, -36, 50, 89, 64, -64, -75, -18, -36, 83, 89, -75,
    64, 64, -75, -89, 36, 83, 18, -75, -64, 64, 89, -50, -83, 36, 50, -18,
    90, 87, 87, 57, 80, 9, 70, -43, 57, -80, 43, -90, 25, -70, 9, -25,
    80, 70, 9, -43, -70, -87, -87, 9, -25, 90, 57, 25, 90, -80, 43, -57,
    57, 43, -80, -90, -25, 57, 90, 25, -9, -87, -87, 70, 43, 9, 70, -80,
    25, 9, -70, -25, 90, 43, -80, -57, 43, 70, 9, -80, -57, 87, 87, -90,
    90, 90, 90, 82, 88, 67, 85, 46, 82, 22, 78, -4, 73, -31, 67, -54,
    61, -73, 54, -85, 46, -90, 38, -88, 31, -78, 22, -61, 13, -38, 4, -13,
    88, 85, 67, 46, 31, -13, -13, -67, -54, -90, -82, -73, -90, -22, -78, 38,
    -46, 82, -4, 88, 38, 54, 73, -4, 90, -61, 85, -90, 61, -78, 22, -31,
    82, 78, 22, -4, -54, -82, -90, -73, -61, 13, 13, 85, 78, 67, 85, -22,
    31, -88, -46, -61, -90, 31, -67, 90, 4, 54, 73, -38, 88, -90, 38, -46,
    73, 67, -31, -54, -90, -78, -22, 38, 78, 85, 67, -22, -38, -90, -90, 4,
    -13, 90, 82, 13, 61, -88, -46, -31, -88, 82, -4, 46, 85, -73, 54, -61,
    61, 54, -73, -85, -46, -4, 82, 88, 31, -46, -88, -61, -13, 82, 90, 13,
    -4, -90, -90, 38, 22, 67, 85, -78, -38, -22, -78, 90, 54, -31, 67, -73,
    46, 38, -90, -88, 38, 73, 54, -4, -90, -67, 31, 90, 61, -46, -88, -31,
    22, 85, 67, -78, -85, 13, 13, 61, 73, -90, -82, 54, 4, 22, 78, -82,
    31, 22, -78, -61, 90, 85, -61, -90, 4, 73, 54, -38, -88, -4, 82, 46,
    -38, -78, -22, 90, 73, -82, -90, 54, 67, -13, -13, -31, -46, 67, 85, -88,
    13, 4, -38, -13, 61, 22, -78, -31, 88, 38, -90, -46, 85, 54, -73, -61,
    54, 67, -31, -73, 4, 78, 22, -82, -46, 85, 67, -88, -82, 90, 90, -90
};

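// Reverse the order of the eight 16-bit lanes of x.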
static __m128i reverse_epi16(__m128i x)
{
    x = _mm_shuffle_epi32(x, 0x1b); // 00011011
    x = _mm_shufflelo_epi16(x, 0xb1); // 10110001
    x = _mm_shufflehi_epi16(x, 0xb1);
    return x;
}

// 16-point forward transform (16 rows)
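// Each row is split by the even/odd decomposition: y1 is lane-reversed so that
// x0 = src[k] + src[15-k] (even part) and x1 = src[k] - src[15-k] (odd part).
// The even part drives coeff32[0..7], the odd part coeff32[8..15]; the rounded,
// shifted results are packed and re-interleaved into natural coefficient order.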
static void Transform16(short *src, int src_stride, short *dst, int dst_stride, int shift)
{
    int i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 16; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a1, a2, a3;
        __m128i b0, b1, b2, b3;

        y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);

        y0 = _mm_unpacklo_epi16(x0, x1);
        y1 = _mm_unpackhi_epi16(x0, x1);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), y1);
    }
}

// 16-point inverse transform (16 rows)
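// The unpack ladder below deinterleaves each row into even-index and odd-index
// coefficients; the even half is multiplied by coeff32[0..7] and the odd half
// by coeff32[8..15], after which b = even +/- odd reconstructs the outputs
// (the second half in reversed order, restored by the 0x1b shuffles).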
static void InvTransform16(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    int i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;

    for (i = 0; i < 16; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a1, a2, a3;
        __m128i b0, b1, b2, b3;
        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f

        y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
        y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f

        x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
        x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f

        y0 = _mm_unpacklo_epi16(x0, x1); // 00 02 04 06 08 0a 0c 0e
        y1 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f

        x0 = y0;
        x1 = y1;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        a0 = _mm_sra_epi32(a0, s0);
        a1 = _mm_sra_epi32(a1, s0);
        a2 = _mm_sra_epi32(a2, s0);
        a3 = _mm_sra_epi32(a3, s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
    }
}

// transpose 16x16 block of data
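// Three rounds of 16-bit unpacklo/unpackhi transpose each 8x8 sub-block;
// source sub-block (i, j) is written to destination sub-block (j, i).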
static void Transpose16(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    int i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}

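// Partial-frequency transpose: only the left 16 columns of the 32x32 block are
// populated (i < 4 row groups, j < 2 column groups), producing a 16x32 result.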
static void PfreqTranspose32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 2; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}

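// N4 (quarter-size) second-stage transpose: only the top-left 8x8 sub-block
// carries data, so a single 8x8 transpose suffices.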
static void PfreqN4SecTranspose32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;

    i = j = 0;
    {
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}
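
// N4 first-stage transpose: the left 8 columns of all 32 rows are populated
// (i < 4, j fixed at 0), producing an 8x32 result.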
static void PfreqN4FirstTranspose32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;

    for (i = 0; i < 4; i++)
    {
        //for (j = 0; j < 2; j++)
        j = 0;
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}

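// Partial-frequency type-1 transpose: top-left 16x16 quadrant of the 32x32 block.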
void EbHevcPfreqTranspose32Type1_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 2; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}
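
// Partial-frequency type-2 transpose: 16 rows by 32 columns in, 32x16 out.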
void PfreqTranspose32Type2_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 4; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}

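// Full 32x32 transpose (sixteen 8x8 sub-blocks).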
static void Transpose32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 4; i++)
    {
        for (j = 0; j < 4; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}

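// Partial-frequency 32-point inverse transform, second pass (all 32 rows).
// Only the first 16 coefficients of each row are loaded; after deinterleaving,
// only the low four lanes of x0/x1 are meaningful (the upper coefficients are
// zero), so the even butterfly needs two multiply-accumulate stages instead of four.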
void Pfreq2DInvTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;

    for (i = 0; i < 32; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2;
        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
        __m128i b0, b1, b2, b3, b4, b5, b6, b7;
        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f

        y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
        y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f

        x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
        x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f

        y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 08 0c 10 14 18 1c y0=part of it zero
        y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 0a 0e 12 16 1a 1e y1=part of it zero
        y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f

        x0 = y0; //part of it zero
        x1 = y1; //part of it zero
        x2 = y2;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        b0 = _mm_add_epi32(a0, a4);
        b1 = _mm_add_epi32(a1, a5);
        b2 = _mm_add_epi32(a2, a6);
        b3 = _mm_add_epi32(a3, a7);
        b4 = _mm_sub_epi32(a0, a4);
        b5 = _mm_sub_epi32(a1, a5);
        b6 = _mm_sub_epi32(a2, a6);
        b7 = _mm_sub_epi32(a3, a7);

        a0 = _mm_sra_epi32(b0, s0);
        a1 = _mm_sra_epi32(b1, s0);
        a2 = _mm_sra_epi32(b2, s0);
        a3 = _mm_sra_epi32(b3, s0);
        a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
        a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
        a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
        a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);
        x2 = _mm_packs_epi32(a4, a5);
        x3 = _mm_packs_epi32(a6, a7);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
    }
}
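
// Partial-frequency 32-point inverse transform, first pass: same row kernel,
// but only the 16 rows that carry non-zero coefficients are processed.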
void Pfreq1DInvTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;

    for (i = 0; i < 16; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2;
        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
        __m128i b0, b1, b2, b3, b4, b5, b6, b7;
        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f

        y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
        y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f

        x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
        x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f

        y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 08 0c 10 14 18 1c y0=part of it zero
        y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 0a 0e 12 16 1a 1e y1=part of it zero
        y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f

        x0 = y0; //part of it zero
        x1 = y1; //part of it zero
        x2 = y2;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        b0 = _mm_add_epi32(a0, a4);
        b1 = _mm_add_epi32(a1, a5);
        b2 = _mm_add_epi32(a2, a6);
        b3 = _mm_add_epi32(a3, a7);
        b4 = _mm_sub_epi32(a0, a4);
        b5 = _mm_sub_epi32(a1, a5);
        b6 = _mm_sub_epi32(a2, a6);
        b7 = _mm_sub_epi32(a3, a7);

        a0 = _mm_sra_epi32(b0, s0);
        a1 = _mm_sra_epi32(b1, s0);
        a2 = _mm_sra_epi32(b2, s0);
        a3 = _mm_sra_epi32(b3, s0);
        a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
        a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
        a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
        a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);
        x2 = _mm_packs_epi32(a4, a5);
        x3 = _mm_packs_epi32(a6, a7);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
    }
}

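// Partial-frequency 32x32 inverse: transpose, first inverse pass (shift 7),
// transpose back, second inverse pass (shift 12 - addshift).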
void PfreqEstimateInvTransform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    EbHevcPfreqTranspose32Type1_SSE2(src, src_stride, intermediate, 32);
    Pfreq1DInvTransform32_SSE2(intermediate, 32, dst, dst_stride, 7);
    PfreqTranspose32Type2_SSE2(dst, dst_stride, intermediate, 32);
    Pfreq2DInvTransform32_SSE2(intermediate, 32, dst, dst_stride, 12 - addshift);
}

// 32-point inverse transform (32 rows)
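// Full-precision variant: all 32 coefficients per row are loaded; the even
// 16-point part is built from even-8 (coeff32[0..7]) and odd-8 (coeff32[8..15])
// stages, while the odd 16-point part accumulates eight multiply-adds per
// output from coeff32[16..47].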
static void InvTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;

    for (i = 0; i < 32; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
        __m128i b0, b1, b2, b3, b4, b5, b6, b7;
        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00)); // 00 01 02 03 04 05 06 07
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08)); // 08 09 0a 0b 0c 0d 0e 0f
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10)); // 10 11 12 13 14 15 16 17
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18)); // 18 19 1a 1b 1c 1d 1e 1f

        y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
        y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
        y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
        y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c

        x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
        x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
        x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
        x3 = _mm_unpackhi_epi16(y2, y3); // 12 16

        y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
        y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
        y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
        y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f

        x0 = y0;
        x1 = y1;
        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        b0 = _mm_add_epi32(a0, a4);
        b1 = _mm_add_epi32(a1, a5);
        b2 = _mm_add_epi32(a2, a6);
        b3 = _mm_add_epi32(a3, a7);
        b4 = _mm_sub_epi32(a0, a4);
        b5 = _mm_sub_epi32(a1, a5);
        b6 = _mm_sub_epi32(a2, a6);
        b7 = _mm_sub_epi32(a3, a7);

        a0 = _mm_sra_epi32(b0, s0);
        a1 = _mm_sra_epi32(b1, s0);
        a2 = _mm_sra_epi32(b2, s0);
        a3 = _mm_sra_epi32(b3, s0);
        a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
        a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
        a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
        a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);
        x2 = _mm_packs_epi32(a4, a5);
        x3 = _mm_packs_epi32(a6, a7);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
    }
}

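// Estimated 32x32 inverse: transpose, 1-D row pass at shift 7, second
// transpose, then a 1-D column pass at shift 12 - addshift (addshift is
// presumably the bit-depth increment, so 8-bit content ends at shift 12).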
void EstimateInvTransform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Transpose32_SSE2(src, src_stride, intermediate, 32);
    InvTransform32_SSE2(intermediate, 32, dst, dst_stride, 7);
    Transpose32_SSE2(dst, dst_stride, intermediate, 32);
    InvTransform32_SSE2(intermediate, 32, dst, dst_stride, 12 - addshift);
}

// forward 16x16 transform
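// Two separable 16-point passes: the first uses shift 4 + addshift, the
// second a fixed shift of 9; each pass is followed by a transpose so the
// coefficients end up back in raster order.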
void Transform16x16_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Transform16(src, src_stride, intermediate, 16, 4 + addshift);
    Transpose16(intermediate, 16, dst, dst_stride);
    Transform16(dst, dst_stride, intermediate, 16, 9);
    Transpose16(intermediate, 16, dst, dst_stride);
}

// inverse 16x16 transform
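// Mirrors the 32x32 estimate above: row pass at shift 7, column pass at
// shift 12 - addshift, with a transpose before each pass.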
void EstimateInvTransform16x16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Transpose16(src, src_stride, intermediate, 16);
    InvTransform16(intermediate, 16, dst, dst_stride, 7);
    Transpose16(dst, dst_stride, intermediate, 16);
    InvTransform16(intermediate, 16, dst, dst_stride, 12 - addshift);
}

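// Forward 32-point transform, one row per iteration: a 32-point butterfly
// folds the input against its reversed upper half, a 16-point butterfly
// splits the even part further, then the even terms are multiplied against
// coeff32[0..15] and the odd terms against coeff32[16..47]. Sums are
// rounded with o0, shifted right by 'shift', saturating-packed to 16 bits,
// and re-interleaved into natural coefficient order before the stores.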
static void Transform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 32; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
        __m128i b0, b1, b2, b3, b4, b5, b6, b7;

        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));

        // 32-point butterfly
        x2 = reverse_epi16(x2);
        x3 = reverse_epi16(x3);

        y0 = _mm_add_epi16(x0, x3);
        y1 = _mm_add_epi16(x1, x2);

        y2 = _mm_sub_epi16(x0, x3);
        y3 = _mm_sub_epi16(x1, x2);

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
        b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
        b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
        b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
        b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);
        x2 = _mm_packs_epi32(b4, b5);
        x3 = _mm_packs_epi32(b6, b7);

        y0 = _mm_unpacklo_epi16(x0, x1);
        y1 = _mm_unpackhi_epi16(x0, x1);
        y2 = x2;
        y3 = x3;
        x0 = _mm_unpacklo_epi16(y0, y2);
        x1 = _mm_unpackhi_epi16(y0, y2);
        x2 = _mm_unpacklo_epi16(y1, y3);
        x3 = _mm_unpackhi_epi16(y1, y3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x10), x2);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x18), x3);
    }
}

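// Partial-frequency version of the forward 32-point row transform: the
// odd-position outputs that the full kernel computes in a1, a3, a6 and a7
// are skipped, and only the first 16 coefficients of each row are stored.
// b1 and b3 are seeded with s0 purely to give _mm_packs_epi32 defined
// inputs; those lanes are discarded by the unpack network before the store.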
static void Pfreq1DTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 32; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a2, a4, a5;
        __m128i b0, b1, b2, b3, b4, b5;

        b1 = s0;
        b3 = s0;

        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));

        // 32-point butterfly
        x2 = reverse_epi16(x2);
        x3 = reverse_epi16(x3);

        y0 = _mm_add_epi16(x0, x3);
        y1 = _mm_add_epi16(x1, x2);

        y2 = _mm_sub_epi16(x0, x3);
        y3 = _mm_sub_epi16(x1, x2);

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
        //
        //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
        b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
        b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
        //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
        //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);
        x2 = _mm_packs_epi32(b4, b5);
        //x3 = _mm_packs_epi32(b6, b7);

        y0 = _mm_unpacklo_epi16(x0, x1);
        //y1 = _mm_unpackhi_epi16(x0, x1);
        y2 = x2;
        //y3 = x3;
        x0 = _mm_unpacklo_epi16(y0, y2);
        x1 = _mm_unpackhi_epi16(y0, y2);
        //x2 = _mm_unpacklo_epi16(y1, y3);
        //x3 = _mm_unpackhi_epi16(y1, y3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
    }
}
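
// Second partial-frequency pass: the same reduced kernel, applied to only
// the 16 rows left meaningful by the first pass and transpose.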
static void Pfreq2DTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 16; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a2, a4, a5;
        __m128i b0, b1, b2, b3, b4, b5;

        b1 = s0;
        b3 = s0;

        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));

        // 32-point butterfly
        x2 = reverse_epi16(x2);
        x3 = reverse_epi16(x3);

        y0 = _mm_add_epi16(x0, x3);
        y1 = _mm_add_epi16(x1, x2);

        y2 = _mm_sub_epi16(x0, x3);
        y3 = _mm_sub_epi16(x1, x2);

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
        //
        //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
        b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
        b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
        //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
        //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);
        x2 = _mm_packs_epi32(b4, b5);
        //x3 = _mm_packs_epi32(b6, b7);

        y0 = _mm_unpacklo_epi16(x0, x1);
        //y1 = _mm_unpackhi_epi16(x0, x1);
        y2 = x2;
        //y3 = x3;
        x0 = _mm_unpacklo_epi16(y0, y2);
        x1 = _mm_unpackhi_epi16(y0, y2);
        //x2 = _mm_unpacklo_epi16(y1, y3);
        //x3 = _mm_unpackhi_epi16(y1, y3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
    }
}

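// N/4 variant of the first pass: on top of the Pfreq reductions, the a5
// products are dropped as well, and a single 8-coefficient store is kept
// per row.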
static void PfreqN41DTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 32; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a2, a4/*, a5*/;
        __m128i b0, b1, b2, b3, b4/*, b5*/;

        b1 = s0;
        b3 = s0;

        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));

        // 32-point butterfly
        x2 = reverse_epi16(x2);
        x3 = reverse_epi16(x3);

        y0 = _mm_add_epi16(x0, x3);
        y1 = _mm_add_epi16(x1, x2);

        y2 = _mm_sub_epi16(x0, x3);
        y3 = _mm_sub_epi16(x1, x2);

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        /**/// a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
        //
        //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
        b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
        /**/// b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
        //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
        //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);
        x2 = _mm_packs_epi32(b4, b4);
        //x3 = _mm_packs_epi32(b6, b7);

        y0 = _mm_unpacklo_epi16(x0, x1);
        //y1 = _mm_unpackhi_epi16(x0, x1);
        y2 = x2;
        //y3 = x3;
        x0 = _mm_unpacklo_epi16(y0, y2);
        /**/// x1 = _mm_unpackhi_epi16(y0, y2);
        //x2 = _mm_unpacklo_epi16(y1, y3);
        //x3 = _mm_unpackhi_epi16(y1, y3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        /**/// _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
    }
}
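
// N/4 second pass: identical kernel, applied to just the 8 rows that
// survive the first N/4 pass and transpose.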
static void PfreqN42DTransform32_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 8; i++)
    {
        __m128i x0, x1, x2, x3;
        __m128i y0, y1, y2, y3;
        __m128i a0, a2, a4/*, a5*/;
        __m128i b0, b1, b2, b3, b4/*, b5*/;

        b1 = s0;
        b3 = s0;

        x0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        x1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));
        x2 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x10));
        x3 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x18));

        // 32-point butterfly
        x2 = reverse_epi16(x2);
        x3 = reverse_epi16(x3);

        y0 = _mm_add_epi16(x0, x3);
        y1 = _mm_add_epi16(x1, x2);

        y2 = _mm_sub_epi16(x0, x3);
        y3 = _mm_sub_epi16(x1, x2);

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        x2 = y2;
        x3 = y3;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        //a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
        //a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        //a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
        //a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));

        /**/// a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
        /**/// a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));

        //a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
        //a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
        //
        //a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
        //a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        //b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
        //b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
        b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
        /**/// b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
        //b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
        //b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);
        x2 = _mm_packs_epi32(b4, b4); // do not use b5
        //x3 = _mm_packs_epi32(b6, b7);

        y0 = _mm_unpacklo_epi16(x0, x1);
        //y1 = _mm_unpackhi_epi16(x0, x1);
        y2 = x2;
        //y3 = x3;
        x0 = _mm_unpacklo_epi16(y0, y2);
        /**/// x1 = _mm_unpackhi_epi16(y0, y2);
        //x2 = _mm_unpacklo_epi16(y1, y3);
        //x3 = _mm_unpackhi_epi16(y1, y3);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), x0);
        /**/// _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x08), x1);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x10), x2);
        //_mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x18), x3);
    }
}

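// Full forward 32x32: row pass at shift 6 + addshift, transpose, column
// pass at shift 9, transpose back.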
void Transform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Transform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
    Transpose32_SSE2(intermediate, 32, dst, dst_stride);
    Transform32_SSE2(dst, dst_stride, intermediate, 32, 9);
    Transpose32_SSE2(intermediate, 32, dst, dst_stride);

    return;
}

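// N/4 forward 32x32: the reduced passes and trimmed transposes above
// produce only the top-left 8x8 block of coefficients.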
void PfreqN4Transform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    PfreqN41DTransform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
    PfreqN4FirstTranspose32_SSE2(intermediate, 32, dst, dst_stride);
    PfreqN42DTransform32_SSE2(dst, dst_stride, intermediate, 32, 9);
    PfreqN4SecTranspose32_SSE2(intermediate, 32, dst, dst_stride);

    return;
}

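// Partial-frequency forward 16-point row pass: only the even-phase
// products (a0, a2) are computed and 8 coefficients are stored per row;
// b1/b3 again just keep the pack inputs defined.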
static void Pfreq1DTransform16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 16; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a2;
        __m128i b0, b1, b2, b3;

        b1 = s0;
        b3 = s0;

        y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);

        y0 = _mm_unpacklo_epi16(x0, x1);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
    }
}

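// Second 16-point partial pass, restricted to the 8 rows left meaningful
// by the first pass and transpose.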
static void Pfreq2DTransform16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 8; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a2;
        __m128i b0, b1, b2, b3;

        b1 = s0;
        b3 = s0;

        y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);

        y0 = _mm_unpacklo_epi16(x0, x1);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0);
    }
}
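
// Transposes the left 16x8 strip (all that the partial first pass wrote)
// into the top 8x16 strip, one 8x8 block at a time; the j loop is kept for
// symmetry with the full transpose but runs once.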
static void PfreqTranspose16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 1; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3, b4, b5, b6, b7;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            b4 = _mm_unpackhi_epi16(a0, a4);
            b5 = _mm_unpackhi_epi16(a1, a5);
            b6 = _mm_unpackhi_epi16(a2, a6);
            b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            a4 = _mm_unpacklo_epi16(b4, b6);
            a5 = _mm_unpacklo_epi16(b5, b7);
            a6 = _mm_unpackhi_epi16(b4, b6);
            a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            b4 = _mm_unpacklo_epi16(a4, a5);
            b5 = _mm_unpackhi_epi16(a4, a5);
            b6 = _mm_unpacklo_epi16(a6, a7);
            b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}
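
// Partial-frequency forward 16x16: the reduced passes yield the top-left
// 8x8 coefficient block.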
void PfreqTransform16x16_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform16_SSE2(src, src_stride, intermediate, 16, 4 + addshift);
    PfreqTranspose16_SSE2(intermediate, 16, dst, dst_stride);
    Pfreq2DTransform16_SSE2(dst, dst_stride, intermediate, 16, 9);
    PfreqTranspose16_SSE2(intermediate, 16, dst, dst_stride);

    return;
}

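// N/4 second pass for the 16x16 path: same even-phase kernel, applied to
// only the 4 rows needed for the retained 4x4 coefficient set.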
static void PfreqN42DTransform16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride,
    EB_U32 shift)
{
    EB_U32 i;
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;

    for (i = 0; i < 4; i++)
    {
        __m128i x0, x1;
        __m128i y0, y1;
        __m128i a0, a2;
        __m128i b0, b1, b2, b3;

        b1 = s0;
        b3 = s0;

        y0 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x00));
        y1 = _mm_loadu_si128((const __m128i *)(src + i*src_stride + 0x08));

        // 16-point butterfly
        y1 = reverse_epi16(y1);

        x0 = _mm_add_epi16(y0, y1);
        x1 = _mm_sub_epi16(y0, y1);

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));

        b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
        b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);

        x0 = _mm_packs_epi32(b0, b1);
        x1 = _mm_packs_epi32(b2, b3);

        y0 = _mm_unpacklo_epi16(x0, x1);

        _mm_storeu_si128((__m128i *)(dst + i*dst_stride + 0x00), y0); // TODO change to 64bit
    }
}
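
// First N/4 transpose for 16x16: only the unpacklo halves are kept, so
// just the first four columns of each 8x8 block are carried into the top
// rows of the destination.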
static void PfreqN4FirstTranspose16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;
    for (i = 0; i < 2; i++)
    {
        for (j = 0; j < 1; j++)
        {
            __m128i a0, a1, a2, a3, a4, a5, a6, a7;
            __m128i b0, b1, b2, b3/*, b4, b5, b6, b7*/;

            a0 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 0)*src_stride + 8 * j));
            a1 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 1)*src_stride + 8 * j));
            a2 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 2)*src_stride + 8 * j));
            a3 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 3)*src_stride + 8 * j));
            a4 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 4)*src_stride + 8 * j));
            a5 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 5)*src_stride + 8 * j));
            a6 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 6)*src_stride + 8 * j));
            a7 = _mm_loadu_si128((const __m128i *)(src + (8 * i + 7)*src_stride + 8 * j));

            b0 = _mm_unpacklo_epi16(a0, a4);
            b1 = _mm_unpacklo_epi16(a1, a5);
            b2 = _mm_unpacklo_epi16(a2, a6);
            b3 = _mm_unpacklo_epi16(a3, a7);
            // b4 = _mm_unpackhi_epi16(a0, a4);
            // b5 = _mm_unpackhi_epi16(a1, a5);
            // b6 = _mm_unpackhi_epi16(a2, a6);
            // b7 = _mm_unpackhi_epi16(a3, a7);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);
            // a4 = _mm_unpacklo_epi16(b4, b6);
            // a5 = _mm_unpacklo_epi16(b5, b7);
            // a6 = _mm_unpackhi_epi16(b4, b6);
            // a7 = _mm_unpackhi_epi16(b5, b7);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);
            // b4 = _mm_unpacklo_epi16(a4, a5);
            // b5 = _mm_unpackhi_epi16(a4, a5);
            // b6 = _mm_unpacklo_epi16(a6, a7);
            // b7 = _mm_unpackhi_epi16(a6, a7);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 4)*dst_stride + 8 * i), b4);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 5)*dst_stride + 8 * i), b5);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 6)*dst_stride + 8 * i), b6);
            // _mm_storeu_si128((__m128i *)(dst + (8 * j + 7)*dst_stride + 8 * i), b7);
        }
    }
}
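
// Second N/4 transpose: a0..a3 stand in for the missing high rows, so each
// stored row holds the transposed 4x4 values in its first four lanes and
// duplicates of them in the rest.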
static void PfreqN4SecondTranspose16_SSE2(
    EB_S16 *src,
    EB_U32 src_stride,
    EB_S16 *dst,
    EB_U32 dst_stride)
{
    EB_U32 i, j;

    i = j = 0;
    {
        {
            __m128i a0, a1, a2, a3/*, a4, a5, a6, a7*/;
            __m128i b0, b1, b2, b3/*, b4, b5, b6, b7*/;

            a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride)); // TODO load only 64bit
            a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride));
            a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride));
            a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride));

            b0 = _mm_unpacklo_epi16(a0, a0/*a4*/);
            b1 = _mm_unpacklo_epi16(a1, a1/*a5*/);
            b2 = _mm_unpacklo_epi16(a2, a2/*a6*/);
            b3 = _mm_unpacklo_epi16(a3, a3/*a7*/);

            a0 = _mm_unpacklo_epi16(b0, b2);
            a1 = _mm_unpacklo_epi16(b1, b3);
            a2 = _mm_unpackhi_epi16(b0, b2);
            a3 = _mm_unpackhi_epi16(b1, b3);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);
            b2 = _mm_unpacklo_epi16(a2, a3);
            b3 = _mm_unpackhi_epi16(a2, a3);

            _mm_storeu_si128((__m128i *)(dst + (8 * j + 0)*dst_stride + 8 * i), b0);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 1)*dst_stride + 8 * i), b1);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 2)*dst_stride + 8 * i), b2);
            _mm_storeu_si128((__m128i *)(dst + (8 * j + 3)*dst_stride + 8 * i), b3);
        }
    }
}

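// Partial-frequency forward 32x32: the reduced passes and full transposes
// leave the top-left 16x16 coefficient block.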
void PfreqTransform32x32_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform32_SSE2(src, src_stride, intermediate, 32, 6 + addshift);
    PfreqTranspose32_SSE2(intermediate, 32, dst, dst_stride);
    Pfreq2DTransform32_SSE2(dst, dst_stride, intermediate, 32, 9);
    PfreqTranspose32_SSE2(intermediate, 32, dst, dst_stride);

    return;
}

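// N/4 forward 16x16: the 4-row second pass and trimmed transposes keep the
// meaningful top-left 4x4 coefficients.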
void PfreqN4Transform16x16_SSE2(
    EB_S16 *src,
    const EB_U32 src_stride,
    EB_S16 *dst,
    const EB_U32 dst_stride,
    EB_S16 *intermediate,
    EB_U32 addshift)
{
    Pfreq1DTransform16_SSE2(src, src_stride, intermediate, 16, 4 + addshift);
    PfreqN4FirstTranspose16_SSE2(intermediate, 16, dst, dst_stride);
    PfreqN42DTransform16_SSE2(dst, dst_stride, intermediate, 16, 9);
    PfreqN4SecondTranspose16_SSE2(intermediate, 16, dst, dst_stride);

    return;
}

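// 4x4 forward DCT, fully fused in registers: the first stage handles the
// even part with adds/subtracts, folding the x64 multiply and first-stage
// shift into a single left shift by (5 - bitIncrement), and the odd part
// with 83/36 multiplies; MACRO_TRANS_2MAC then performs the second stage
// with a rounding offset of 128 and a shift of 8, storing one 4-sample
// output row per invocation.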
Transform4x4_SSE2_INTRIN(EB_S16 * residual,const EB_U32 srcStride,EB_S16 * transformCoefficients,const EB_U32 dstStride,EB_S16 * transformInnerArrayPtr,EB_U32 bitIncrement)2500 void Transform4x4_SSE2_INTRIN(
2501 EB_S16 *residual,
2502 const EB_U32 srcStride,
2503 EB_S16 *transformCoefficients,
2504 const EB_U32 dstStride,
2505 EB_S16 *transformInnerArrayPtr,
2506 EB_U32 bitIncrement)
2507 {
2508 #define OFFSET_128 0
2509 #define OFFSET_64_64 8
2510 #define OFFSET_83_36 16
2511 #define OFFSET_N36_N83 24
2512 #define OFFSET_64_N64 32
2513 #define OFFSET_N64_64 40
2514 #define OFFSET_36_N83 48
2515 #define OFFSET_83_N36 56
2516
2517
2518 EB_ALIGN(16) EB_S16 transformIntrinConst_SSE2[] = {
2519 128, 0, 128, 0, 128, 0, 128, 0,
2520 64, 64, 64, 64, 64, 64, 64, 64,
2521 83, 36, 83, 36, 83, 36, 83, 36,
2522 -36, -83, -36, -83, -36, -83, -36, -83,
2523 64, -64, 64, -64, 64, -64, 64, -64,
2524 -64, 64, -64, 64, -64, 64, -64, 64,
2525 36, -83, 36, -83, 36, -83, 36, -83,
2526 83, -36, 83, -36, 83, -36, 83, -36
2527 };
2528 EB_ALIGN(16) const EB_S16 * EbHevcTransformAsmConst = transformIntrinConst_SSE2;
2529 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm_offset, xmm_shift;
2530
2531 xmm_shift = _mm_cvtsi32_si128(5 - bitIncrement);
2532 xmm0 = _mm_loadl_epi64((__m128i*)(residual));
2533 xmm1 = _mm_loadl_epi64((__m128i*)(residual + srcStride));
2534 xmm2 = _mm_loadl_epi64((__m128i *)(residual + 2 * srcStride));
2535 xmm3 = _mm_loadl_epi64((__m128i *)(residual + 3 * srcStride));
2536 xmm0 = _mm_unpacklo_epi16(xmm0, xmm1);
2537 xmm2 = _mm_unpacklo_epi16(xmm2, xmm3);
2538
2539 xmm1 = _mm_unpackhi_epi32(xmm0, xmm2);
2540 xmm0 = _mm_unpacklo_epi32(xmm0, xmm2);
2541 xmm1 = _mm_unpacklo_epi64(_mm_srli_si128(xmm1, 8), xmm1);
2542 xmm3 = _mm_sub_epi16(xmm0, xmm1);
2543 xmm0 = _mm_add_epi16(xmm0, xmm1);
2544
2545 xmm4 = xmm2 = xmm0;
2546 xmm0 = _mm_srli_si128(xmm0, 8);
2547 xmm2 = _mm_sll_epi16(_mm_add_epi16(xmm2, xmm0), xmm_shift);
2548 xmm4 = _mm_sll_epi16(_mm_sub_epi16(xmm4, xmm0), xmm_shift);
2549
2550 xmm_offset = _mm_slli_epi16(_mm_set1_epi32(1), bitIncrement);
2551 xmm_shift = _mm_cvtsi32_si128(bitIncrement + 1);

    // Rows 1 and 3: (83*O0 + 36*O1) and (36*O0 - 83*O1), rounded and shifted.
    xmm1 = _mm_unpacklo_epi16(xmm3, _mm_srli_si128(xmm3, 8));

    xmm3 = _mm_madd_epi16(xmm1, _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_36_N83)));
    xmm1 = _mm_madd_epi16(xmm1, _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_83_36)));
    xmm1 = _mm_add_epi32(xmm1, xmm_offset);
    xmm3 = _mm_add_epi32(xmm3, xmm_offset);
    xmm1 = _mm_sra_epi32(xmm1, xmm_shift);
    xmm3 = _mm_sra_epi32(xmm3, xmm_shift);
    xmm1 = _mm_packs_epi32(xmm1, xmm3);

    // Repack the stage-1 rows so each register feeds the second stage.
    xmm2 = _mm_unpacklo_epi32(xmm2, xmm1);
    xmm1 = _mm_srli_si128(xmm1, 8);
    xmm4 = _mm_unpacklo_epi32(xmm4, xmm1);

    xmm3 = _mm_unpackhi_epi64(xmm2, xmm4);
    xmm2 = _mm_unpacklo_epi64(xmm2, xmm4);

    // Stage 2: rounding offset 128, shift 8; each macro stores one 4-sample
    // coefficient row directly to transformCoefficients.
    xmm_offset = _mm_load_si128((__m128i *)(transformIntrinConst_SSE2 + OFFSET_128));

    MACRO_TRANS_2MAC(xmm2, xmm3, xmm0, xmm1, xmm_offset, OFFSET_64_64, OFFSET_64_64, 8, 0)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm4, xmm5, xmm_offset, OFFSET_83_36, OFFSET_N36_N83, 8, dstStride)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm6, xmm0, xmm_offset, OFFSET_64_N64, OFFSET_N64_64, 8, 2 * dstStride)
    MACRO_TRANS_2MAC(xmm2, xmm3, xmm1, xmm4, xmm_offset, OFFSET_36_N83, OFFSET_83_N36, 8, 3 * dstStride)

    (void)transformCoefficients;
    (void)transformInnerArrayPtr;

#undef OFFSET_128
#undef OFFSET_64_64
#undef OFFSET_83_36
#undef OFFSET_N36_N83
#undef OFFSET_64_N64
#undef OFFSET_N64_64
#undef OFFSET_36_N83
#undef OFFSET_83_N36
}

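/*
 * Illustrative sketch only: a scalar model of one 4-point DCT stage as the
 * intrinsics above compute it, applied first to the rows (shift =
 * bitIncrement + 1) and then to the columns of the stage-1 result (shift = 8).
 * Reference4PointDctStage is a hypothetical name. Unlike the SIMD path it
 * also rounds rows 0 and 2 in the first stage, which changes nothing there
 * because those products are exact multiples of the shift divisor.
 */
static void Reference4PointDctStage(const EB_S16 in[4], EB_S16 out[4], EB_U32 shift)
{
    const EB_S32 add = 1 << (shift - 1);                 // rounding offset
    const EB_S32 e0 = in[0] + in[3], e1 = in[1] + in[2]; // even butterfly
    const EB_S32 o0 = in[0] - in[3], o1 = in[1] - in[2]; // odd butterfly

    out[0] = (EB_S16)((64 * (e0 + e1) + add) >> shift);
    out[1] = (EB_S16)((83 * o0 + 36 * o1 + add) >> shift);
    out[2] = (EB_S16)((64 * (e0 - e1) + add) >> shift);
    out[3] = (EB_S16)((36 * o0 - 83 * o1 + add) >> shift);
}
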
void DstTransform4x4_SSE2_INTRIN(
    EB_S16 *residual,
    const EB_U32 srcStride,
    EB_S16 *transformCoefficients,
    const EB_U32 dstStride,
    EB_S16 *transformInnerArrayPtr,
    EB_U32 bitIncrement)
{
    // Offsets into DstTransformAsmConst_SSE2; most pairs come from the HEVC
    // 4-point DST-VII basis values 29, 55, 74 and 84, replicated per lane.
#define OFFSET_DST_1 0
#define OFFSET_DST_29_55 (8+OFFSET_DST_1)
#define OFFSET_DST_74_84 (8+OFFSET_DST_29_55)
#define OFFSET_DST_84_N29 (8+OFFSET_DST_74_84)
#define OFFSET_DST_N74_55 (8+OFFSET_DST_84_N29)
#define OFFSET_DST_55_N84 (8+OFFSET_DST_N74_55)
#define OFFSET_DST_74_N29 (8+OFFSET_DST_55_N84)
#define OFFSET_DST_37_37 (8+OFFSET_DST_74_N29)
#define OFFSET_DST_74_74 (8+OFFSET_DST_37_37)
#define OFFSET_DST_0_N37 (8+OFFSET_DST_74_74)
#define OFFSET_DST_0_N74 (8+OFFSET_DST_0_N37)

    __m128i xmm_res0, xmm_res1, xmm_res2, xmm_res3, xmm_res0_1, xmm_res2_3, xmm_res_lo, xmm_res_hi, xmm_offset;
    __m128i xmm_trans0, xmm_trans1, xmm_trans2, xmm_trans3, xmm_trans0_1, xmm_trans2_3, xmm_trans_lo, xmm_trans_hi;
    __m128i xmm_temp;

    EB_U32 shift = bitIncrement + 1;
    EB_ALIGN(16) const EB_S16 * EbHevcTransformAsmConst = DstTransformAsmConst_SSE2;

    // First partial butterfly: transpose the 4x4 residual block and apply the
    // four DST rows with rounding offset 1 << (shift - 1).
    xmm_res0 = _mm_loadl_epi64((__m128i *)(residual));
    xmm_res1 = _mm_loadl_epi64((__m128i *)(residual + srcStride));
    xmm_res2 = _mm_loadl_epi64((__m128i *)(residual + 2 * srcStride));
    xmm_res3 = _mm_loadl_epi64((__m128i *)(residual + 3 * srcStride));
    xmm_offset = _mm_srli_epi32(_mm_slli_epi32(_mm_load_si128((__m128i *)(DstTransformAsmConst_SSE2 + OFFSET_DST_1)), shift), 1);

    xmm_res0_1 = _mm_unpacklo_epi32(xmm_res0, xmm_res1);     // |res01    |res-S1-01|res23    |res-S1-23|
    xmm_res2_3 = _mm_unpacklo_epi32(xmm_res2, xmm_res3);     // |res-S2-01|res-S3-01|res-S2-23|res-S3-23|
    xmm_res_hi = _mm_unpackhi_epi64(xmm_res0_1, xmm_res2_3); // |res23    |res-S1-23|res-S2-23|res-S3-23|
    xmm_res_lo = _mm_unpacklo_epi64(xmm_res0_1, xmm_res2_3); // |res01    |res-S1-01|res-S2-01|res-S3-01|

    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans0, xmm_temp, xmm_offset, OFFSET_DST_29_55, OFFSET_DST_74_84, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans1, xmm_temp, xmm_offset, OFFSET_DST_74_74, OFFSET_DST_0_N74, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans2, xmm_temp, xmm_offset, OFFSET_DST_84_N29, OFFSET_DST_N74_55, shift)
    MACRO_TRANS_2MAC_NO_SAVE(xmm_res_lo, xmm_res_hi, xmm_trans3, xmm_temp, xmm_offset, OFFSET_DST_55_N84, OFFSET_DST_74_N29, shift)

    // Second partial butterfly: the same four DST rows applied to the
    // transposed stage-1 output, with offset 128 and shift 8; each macro
    // stores one coefficient row.
    xmm_offset = _mm_set1_epi32(0x00000080); // 128
    xmm_trans0_1 = _mm_unpacklo_epi32(xmm_trans0, xmm_trans1);
    xmm_trans2_3 = _mm_unpacklo_epi32(xmm_trans2, xmm_trans3);
    xmm_trans_hi = _mm_unpackhi_epi64(xmm_trans0_1, xmm_trans2_3);
    xmm_trans_lo = _mm_unpacklo_epi64(xmm_trans0_1, xmm_trans2_3);

    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans0, xmm_temp, xmm_offset, OFFSET_DST_29_55, OFFSET_DST_74_84, 8, 0)
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans1, xmm_temp, xmm_offset, OFFSET_DST_74_74, OFFSET_DST_0_N74, 8, dstStride)
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans2, xmm_temp, xmm_offset, OFFSET_DST_84_N29, OFFSET_DST_N74_55, 8, (2 * dstStride))
    MACRO_TRANS_2MAC(xmm_trans_lo, xmm_trans_hi, xmm_trans3, xmm_temp, xmm_offset, OFFSET_DST_55_N84, OFFSET_DST_74_N29, 8, (3 * dstStride))

    (void)transformInnerArrayPtr;
}
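
/*
 * Illustrative sketch only: a scalar model of one 4-point DST-VII stage as
 * used above (first stage shift = bitIncrement + 1, second stage shift = 8).
 * ReferenceDst4Stage is a hypothetical name; the row weights match the
 * coefficient-pair offsets the macros load (29_55/74_84, 74_74/0_N74,
 * 84_N29/N74_55, 55_N84/74_N29).
 */
static void ReferenceDst4Stage(const EB_S16 in[4], EB_S16 out[4], EB_U32 shift)
{
    const EB_S32 add = 1 << (shift - 1); // rounding offset

    out[0] = (EB_S16)((29 * in[0] + 55 * in[1] + 74 * in[2] + 84 * in[3] + add) >> shift);
    out[1] = (EB_S16)((74 * (in[0] + in[1] - in[3]) + add) >> shift);
    out[2] = (EB_S16)((84 * in[0] - 29 * in[1] - 74 * in[2] + 55 * in[3] + add) >> shift);
    out[3] = (EB_S16)((55 * in[0] - 84 * in[1] + 74 * in[2] - 29 * in[3] + add) >> shift);
}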

void Transform8x8_SSE2_INTRIN(
    EB_S16 *residual,
    const EB_U32 srcStride,
    EB_S16 *transformCoefficients,
    const EB_U32 dstStride,
    EB_S16 *transformInnerArrayPtr,
    EB_U32 bitIncrement)
{
    // Transform8x8 has its own table because the shared table's chained offset
    // macros expand past 256 levels (the maximum macro expansion depth), so a
    // smaller table with just the values Transform8x8 needs is used instead.

    // Rows of the HEVC 8-point DCT basis (even part 64/83/36, odd part
    // 89/75/50/18), each coefficient pair replicated across the 32-bit lanes,
    // plus the stage-2 rounding offset 256.
    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83, 36, 83, 36, 83, 36, 83, 36,
        36, -83, 36, -83, 36, -83, 36, -83,
        89, 75, 89, 75, 89, 75, 89, 75,
        50, 18, 50, 18, 50, 18, 50, 18,
        75, -18, 75, -18, 75, -18, 75, -18,
        -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89, 50, -89, 50, -89, 50, -89,
        18, 75, 18, 75, 18, 75, 18, 75,
        18, -50, 18, -50, 18, -50, 18, -50,
        75, -89, 75, -89, 75, -89, 75, -89,
        256, 0, 256, 0, 256, 0, 256, 0,
        64, 64, 64, 64, 64, 64, 64, 64,
        -18, -50, -18, -50, -18, -50, -18, -50,
        -75, -89, -75, -89, -75, -89, -75, -89,
        -36, -83, -36, -83, -36, -83, -36, -83,
        -83, -36, -83, -36, -83, -36, -83, -36,
        36, 83, 36, 83, 36, 83, 36, 83,
        50, 89, 50, 89, 50, 89, 50, 89,
        18, -75, 18, -75, 18, -75, 18, -75,
        -64, 64, -64, 64, -64, 64, -64, 64,
        64, -64, 64, -64, 64, -64, 64, -64,
        -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50, 89, -50, 89, -50, 89, -50,
        83, -36, 83, -36, 83, -36, 83, -36,
        -36, 83, -36, 83, -36, 83, -36, 83,
        -83, 36, -83, 36, -83, 36, -83, 36,
        89, -75, 89, -75, 89, -75, 89, -75,
        50, -18, 50, -18, 50, -18, 50, -18,
    };
    __m128i sum, sum1, sum2, sum3, sum4;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4, trans5, trans6, trans7, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123, trans46, trans4567;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    // Load the eight rows and transpose them with successive 16/32/64-bit
    // unpacks, then form even[k] = x[k] + x[7-k] and odd[k] = x[k] - x[7-k].
    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    // Rows 0 and 4: 64 * (EE0 +/- EE1) >> (bitIncrement + 2) is exact, so it
    // folds into a left shift by (4 - bitIncrement).
    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    // Rounding offset 2 << bitIncrement = 1 << (shift - 1) for the other rows.
    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    // Rows 2 and 6 from the even-odd terms: (83*EO0 + 36*EO1) and (36*EO0 - 83*EO1).
    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // Coefficient rows 1, 3, 5 and 7 from the odd terms.
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    // Transpose the stage-1 rows, then run the second stage with rounding
    // offset 256 and shift 9, storing one coefficient row per macro.
    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4, trans5, trans6, trans7, trans01, trans23, trans45, trans67)
    MACRO_UNPACK(64, trans0, trans2, trans01, trans23, trans4, trans6, trans45, trans67, trans02, trans0123, trans46, trans4567)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    transformCoefficients += 4 * dstStride;
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}
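
/*
 * Illustrative sketch only: a scalar model of one 8-point DCT stage as the
 * intrinsics above compute it (first stage shift = bitIncrement + 2, second
 * stage shift = 9). Reference8PointDctStage is a hypothetical name; the row
 * weights are the HEVC 8-point basis also listed in transformIntrinConst_8x8.
 */
static void Reference8PointDctStage(const EB_S16 in[8], EB_S16 out[8], EB_U32 shift)
{
    const EB_S32 add = 1 << (shift - 1); // rounding offset
    EB_S32 e[4], o[4], ee[2], eo[2];
    int k;

    for (k = 0; k < 4; ++k) {            // first butterfly
        e[k] = in[k] + in[7 - k];
        o[k] = in[k] - in[7 - k];
    }
    ee[0] = e[0] + e[3]; ee[1] = e[1] + e[2]; // second butterfly, even part
    eo[0] = e[0] - e[3]; eo[1] = e[1] - e[2];

    out[0] = (EB_S16)((64 * (ee[0] + ee[1]) + add) >> shift);
    out[4] = (EB_S16)((64 * (ee[0] - ee[1]) + add) >> shift);
    out[2] = (EB_S16)((83 * eo[0] + 36 * eo[1] + add) >> shift);
    out[6] = (EB_S16)((36 * eo[0] - 83 * eo[1] + add) >> shift);
    out[1] = (EB_S16)((89 * o[0] + 75 * o[1] + 50 * o[2] + 18 * o[3] + add) >> shift);
    out[3] = (EB_S16)((75 * o[0] - 18 * o[1] - 89 * o[2] - 50 * o[3] + add) >> shift);
    out[5] = (EB_S16)((50 * o[0] - 89 * o[1] + 18 * o[2] + 75 * o[3] + add) >> shift);
    out[7] = (EB_S16)((18 * o[0] - 50 * o[1] + 75 * o[2] - 89 * o[3] + add) >> shift);
}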
void PfreqTransform8x8_SSE2_INTRIN(
    EB_S16 *residual,
    const EB_U32 srcStride,
    EB_S16 *transformCoefficients,
    const EB_U32 dstStride,
    EB_S16 *transformInnerArrayPtr,
    EB_U32 bitIncrement)
{
    // Like Transform8x8, this function has its own table because the shared
    // table's chained offset macros exceed the 256-level macro expansion
    // limit. Only the low-frequency (PF-N2) outputs are computed.

    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83, 36, 83, 36, 83, 36, 83, 36,
        36, -83, 36, -83, 36, -83, 36, -83,
        89, 75, 89, 75, 89, 75, 89, 75,
        50, 18, 50, 18, 50, 18, 50, 18,
        75, -18, 75, -18, 75, -18, 75, -18,
        -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89, 50, -89, 50, -89, 50, -89,
        18, 75, 18, 75, 18, 75, 18, 75,
        18, -50, 18, -50, 18, -50, 18, -50,
        75, -89, 75, -89, 75, -89, 75, -89,
        256, 0, 256, 0, 256, 0, 256, 0,
        64, 64, 64, 64, 64, 64, 64, 64,
        -18, -50, -18, -50, -18, -50, -18, -50,
        -75, -89, -75, -89, -75, -89, -75, -89,
        -36, -83, -36, -83, -36, -83, -36, -83,
        -83, -36, -83, -36, -83, -36, -83, -36,
        36, 83, 36, 83, 36, 83, 36, 83,
        50, 89, 50, 89, 50, 89, 50, 89,
        18, -75, 18, -75, 18, -75, 18, -75,
        -64, 64, -64, 64, -64, 64, -64, 64,
        64, -64, 64, -64, 64, -64, 64, -64,
        -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50, 89, -50, 89, -50, 89, -50,
        83, -36, 83, -36, 83, -36, 83, -36,
        -36, 83, -36, 83, -36, 83, -36, 83,
        -83, 36, -83, 36, -83, 36, -83, 36,
        89, -75, 89, -75, 89, -75, 89, -75,
        50, -18, 50, -18, 50, -18, 50, -18,
    };
    __m128i sum, sum1, sum2/*, sum3, sum4*/;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    // Stage 1, pruned for PF-N2: trans6 (and trans5/trans7 below) are never
    // needed for the low-frequency outputs, so those butterflies stay disabled.
    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    //trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
    //                         _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // TransformCoefficients 1, 3 (5 and 7 are pruned)
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans1, trans1, trans1, trans01, trans23, trans45, trans67)
    MACRO_UNPACK_V2(64, trans0, trans2, trans01, trans23, trans4, trans0, /*trans6,*/ trans45, trans67, trans02, trans0123)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    // Stage 2, pruned for PF-N2: only coefficient rows 0-3 are computed and
    // stored; the high-frequency rows 4-7 remain disabled below.
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    //transformCoefficients += 4 * dstStride;
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}
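
/*
 * Note (added for clarity): compared with Transform8x8_SSE2_INTRIN above, this
 * PF-N2 variant drops trans5/trans6/trans7 in the first stage and the four
 * high-frequency row passes in the second stage, so only the low-frequency
 * portion of the 8x8 coefficient block is ever written.
 */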

void PfreqN4Transform8x8_SSE2_INTRIN(
    EB_S16 *residual,
    const EB_U32 srcStride,
    EB_S16 *transformCoefficients,
    const EB_U32 dstStride,
    EB_S16 *transformInnerArrayPtr,
    EB_U32 bitIncrement)
{
    // Like Transform8x8, this function has its own table because the shared
    // table's chained offset macros exceed the 256-level macro expansion
    // limit. Only the low-frequency (PF-N4) outputs are computed.

    EB_ALIGN(16) EB_S16 transformIntrinConst_8x8[] = {
        83, 36, 83, 36, 83, 36, 83, 36,
        36, -83, 36, -83, 36, -83, 36, -83,
        89, 75, 89, 75, 89, 75, 89, 75,
        50, 18, 50, 18, 50, 18, 50, 18,
        75, -18, 75, -18, 75, -18, 75, -18,
        -89, -50, -89, -50, -89, -50, -89, -50,
        50, -89, 50, -89, 50, -89, 50, -89,
        18, 75, 18, 75, 18, 75, 18, 75,
        18, -50, 18, -50, 18, -50, 18, -50,
        75, -89, 75, -89, 75, -89, 75, -89,
        256, 0, 256, 0, 256, 0, 256, 0,
        64, 64, 64, 64, 64, 64, 64, 64,
        -18, -50, -18, -50, -18, -50, -18, -50,
        -75, -89, -75, -89, -75, -89, -75, -89,
        -36, -83, -36, -83, -36, -83, -36, -83,
        -83, -36, -83, -36, -83, -36, -83, -36,
        36, 83, 36, 83, 36, 83, 36, 83,
        50, 89, 50, 89, 50, 89, 50, 89,
        18, -75, 18, -75, 18, -75, 18, -75,
        -64, 64, -64, 64, -64, 64, -64, 64,
        64, -64, 64, -64, 64, -64, 64, -64,
        -75, -18, -75, -18, -75, -18, -75, -18,
        89, -50, 89, -50, 89, -50, 89, -50,
        83, -36, 83, -36, 83, -36, 83, -36,
        -36, 83, -36, 83, -36, 83, -36, 83,
        -83, 36, -83, 36, -83, 36, -83, 36,
        89, -75, 89, -75, 89, -75, 89, -75,
        50, -18, 50, -18, 50, -18, 50, -18,
    };
    __m128i sum, sum1, sum2/*, sum3, sum4*/;
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    __m128i res01, res23, res45, res67, res02, res0123, res46, res4567, res04, res0246, res0145, res0_to_7;
    __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3, odd01_lo, odd01_hi, odd23_lo, odd23_hi;
    __m128i evenEven0, evenEven1, evenOdd0, evenOdd1;
    __m128i trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans01, trans23, trans45, trans67;
    __m128i trans02, trans0123;
    __m128i xmm_offset;
    EB_ALIGN(16) EB_S16 * TransformIntrinConst = transformIntrinConst_8x8;
    EB_U32 shift;

    res0 = _mm_loadu_si128((__m128i *)residual);
    res1 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res2 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res3 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));
    residual += (srcStride << 2);
    res4 = _mm_loadu_si128((__m128i *)residual);
    res5 = _mm_loadu_si128((__m128i *)(residual + srcStride));
    res6 = _mm_loadu_si128((__m128i *)(residual + 2 * srcStride));
    res7 = _mm_loadu_si128((__m128i *)(residual + 3 * srcStride));

    MACRO_UNPACK(16, res0, res1, res2, res3, res4, res5, res6, res7, res01, res23, res45, res67)
    MACRO_UNPACK(32, res0, res2, res01, res23, res4, res6, res45, res67, res02, res0123, res46, res4567)
    MACRO_UNPACK(64, res0, res4, res02, res46, res01, res45, res0123, res4567, res04, res0246, res0145, res0_to_7)
    MACRO_CALC_EVEN_ODD(res0, res04, res02, res0246, res01, res0145, res0123, res0_to_7)

    evenEven0 = _mm_add_epi16(even0, even3);
    evenEven1 = _mm_add_epi16(even1, even2);
    evenOdd0 = _mm_sub_epi16(even0, even3);
    evenOdd1 = _mm_sub_epi16(even1, even2);

    shift = 4 - bitIncrement;
    trans0 = _mm_slli_epi16(_mm_add_epi16(evenEven0, evenEven1), shift);
    trans4 = _mm_slli_epi16(_mm_sub_epi16(evenEven0, evenEven1), shift);

    xmm_offset = _mm_slli_epi32(_mm_set1_epi32(0x00000002), bitIncrement);
    shift = bitIncrement + 2;

    // Stage 1, pruned for PF-N4: trans6 (and trans5/trans7 below) stay disabled.
    trans2 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
                             _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_83_36)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    //trans6 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpacklo_epi16(evenOdd0, evenOdd1)), xmm_offset), shift),
    //                         _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(_mm_load_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_36_N83)), _mm_unpackhi_epi16(evenOdd0, evenOdd1)), xmm_offset), shift));

    // TransformCoefficients 1, 3 (5 and 7 are pruned)
    odd01_lo = _mm_unpacklo_epi16(odd0, odd1);
    odd01_hi = _mm_unpackhi_epi16(odd0, odd1);
    odd23_lo = _mm_unpacklo_epi16(odd2, odd3);
    odd23_hi = _mm_unpackhi_epi16(odd2, odd3);

    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans1, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, shift)
    MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans3, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans5, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, shift)
    //MACRO_TRANS_4MAC_NO_SAVE(odd01_lo, odd01_hi, odd23_lo, odd23_hi, trans7, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, shift)

    MACRO_UNPACK(32, trans0, trans1, trans2, trans3, trans4/*, trans5, trans6, trans7*/, trans1, trans1, trans1, trans01, trans23, trans45, trans67)
    MACRO_UNPACK_V2(64, trans0, trans2, trans01, trans23, trans4, trans0, /*trans6,*/ trans45, trans67, trans02, trans0123)

    xmm_offset = _mm_loadu_si128((__m128i *)(TransformIntrinConst + TRANS8x8_OFFSET_256));

    // Stage 2, pruned for PF-N4: only the first two coefficient-row passes run.
    MACRO_TRANS_8MAC_PF_N4(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, TRANS8x8_OFFSET_64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    MACRO_TRANS_8MAC_PF_N4(trans0, trans02, trans01, trans0123, trans4, trans45, trans45, trans45, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_89_75, TRANS8x8_OFFSET_50_18, TRANS8x8_OFFSET_N18_N50, TRANS8x8_OFFSET_N75_N89, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_83_36, TRANS8x8_OFFSET_N36_N83, TRANS8x8_OFFSET_N83_N36, TRANS8x8_OFFSET_36_83, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC_PF_N2(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_75_N18, TRANS8x8_OFFSET_N89_N50, TRANS8x8_OFFSET_50_89, TRANS8x8_OFFSET_18_N75, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))
    //transformCoefficients += 4 * dstStride;
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, TRANS8x8_OFFSET_64_N64, TRANS8x8_OFFSET_N64_64, 9, _mm_storeu_si128, transformCoefficients, 0)
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_50_N89, TRANS8x8_OFFSET_18_75, TRANS8x8_OFFSET_N75_N18, TRANS8x8_OFFSET_89_N50, 9, _mm_storeu_si128, transformCoefficients, (dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_36_N83, TRANS8x8_OFFSET_83_N36, TRANS8x8_OFFSET_N36_83, TRANS8x8_OFFSET_N83_36, 9, _mm_storeu_si128, transformCoefficients, (2 * dstStride))
    //MACRO_TRANS_8MAC(trans0, trans02, trans01, trans0123, trans4, trans46, trans45, trans4567, xmm_offset, TransformIntrinConst, TRANS8x8_OFFSET_18_N50, TRANS8x8_OFFSET_75_N89, TRANS8x8_OFFSET_89_N75, TRANS8x8_OFFSET_50_N18, 9, _mm_storeu_si128, transformCoefficients, (3 * dstStride))

    (void)transformInnerArrayPtr;
}
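
/*
 * Note (added for clarity): the N4 variant prunes even further than the N2
 * one -- only the first two second-stage row passes run (via
 * MACRO_TRANS_8MAC_PF_N4), so just the lowest-frequency corner of the 8x8
 * coefficient block is written.
 */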