/*
* Copyright(c) 2018 Intel Corporation
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/

#include "EbDefinitions.h"

#include "EbTransforms_SSSE3.h"

#include <emmintrin.h>
#include <tmmintrin.h>


#define SSSE3 // __SSSE3__



#ifdef __cplusplus
extern "C" const EB_S16 EbHevcCoeff_tbl[48*8];
extern "C" const EB_S16 EbHevcCoeff_tbl2[48*8];
#else
extern const EB_S16 EbHevcCoeff_tbl[48*8];
extern const EB_S16 EbHevcCoeff_tbl2[48*8];
#endif
25
26
27 // Reverse order of 16-bit elements within 128-bit vector
28 // This can be done more efficiently with _mm_shuffle_epi8 but requires SSSE3
static __m128i reverse_epi16(__m128i x)
30 {
#ifdef SSSE3 // __SSSE3__
32 return _mm_shuffle_epi8(x, _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
33 #else
34 x = _mm_shuffle_epi32(x, 0x1b); // 00011011
35 x = _mm_shufflelo_epi16(x, 0xb1); // 10110001
36 x = _mm_shufflehi_epi16(x, 0xb1);
37 return x;
38 #endif
39 }
40
41
42 // transpose 16x16 block of data
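// The 16x16 block is handled as four 8x8 sub-blocks; each sub-block is
// transposed in registers with three rounds of 16-bit unpacks and stored
// to the mirrored (j,i) position of the destination.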
static void transpose16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
44 {
45 EB_U32 i, j;
46 for (i = 0; i < 2; i++)
47 {
48 for (j = 0; j < 2; j++)
49 {
50 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
51 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
52
53 a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
54 a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
55 a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
56 a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
57 a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
58 a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
59 a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
60 a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
61
62 b0 = _mm_unpacklo_epi16(a0, a4);
63 b1 = _mm_unpacklo_epi16(a1, a5);
64 b2 = _mm_unpacklo_epi16(a2, a6);
65 b3 = _mm_unpacklo_epi16(a3, a7);
66 b4 = _mm_unpackhi_epi16(a0, a4);
67 b5 = _mm_unpackhi_epi16(a1, a5);
68 b6 = _mm_unpackhi_epi16(a2, a6);
69 b7 = _mm_unpackhi_epi16(a3, a7);
70
71 a0 = _mm_unpacklo_epi16(b0, b2);
72 a1 = _mm_unpacklo_epi16(b1, b3);
73 a2 = _mm_unpackhi_epi16(b0, b2);
74 a3 = _mm_unpackhi_epi16(b1, b3);
75 a4 = _mm_unpacklo_epi16(b4, b6);
76 a5 = _mm_unpacklo_epi16(b5, b7);
77 a6 = _mm_unpackhi_epi16(b4, b6);
78 a7 = _mm_unpackhi_epi16(b5, b7);
79
80 b0 = _mm_unpacklo_epi16(a0, a1);
81 b1 = _mm_unpackhi_epi16(a0, a1);
82 b2 = _mm_unpacklo_epi16(a2, a3);
83 b3 = _mm_unpackhi_epi16(a2, a3);
84 b4 = _mm_unpacklo_epi16(a4, a5);
85 b5 = _mm_unpackhi_epi16(a4, a5);
86 b6 = _mm_unpacklo_epi16(a6, a7);
87 b7 = _mm_unpackhi_epi16(a6, a7);
88
89 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
90 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
91 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
92 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
93 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
94 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
95 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
96 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
97 }
98 }
99 }
100
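// Partial variant of transpose16: only the top 8*numRows source rows are
// transposed (numRows = 2 - (pattern & 1)). When bit 0 of pattern is set the
// bottom 8 rows hold no data, so only the first 8 columns of dst are written.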
static void transpose16Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 pattern)
102 {
103 EB_U32 j;
104 EB_U32 numRows = 2 - (pattern & 1);
105
106 do
107 {
108 for (j = 0; j < 2; j++)
109 {
110 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
111 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
112
113 a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride + 8*j));
114 a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride + 8*j));
115 a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride + 8*j));
116 a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride + 8*j));
117 a4 = _mm_loadu_si128((const __m128i *)(src + (4)*src_stride + 8*j));
118 a5 = _mm_loadu_si128((const __m128i *)(src + (5)*src_stride + 8*j));
119 a6 = _mm_loadu_si128((const __m128i *)(src + (6)*src_stride + 8*j));
120 a7 = _mm_loadu_si128((const __m128i *)(src + (7)*src_stride + 8*j));
121
122 b0 = _mm_unpacklo_epi16(a0, a4);
123 b1 = _mm_unpacklo_epi16(a1, a5);
124 b2 = _mm_unpacklo_epi16(a2, a6);
125 b3 = _mm_unpacklo_epi16(a3, a7);
126 b4 = _mm_unpackhi_epi16(a0, a4);
127 b5 = _mm_unpackhi_epi16(a1, a5);
128 b6 = _mm_unpackhi_epi16(a2, a6);
129 b7 = _mm_unpackhi_epi16(a3, a7);
130
131 a0 = _mm_unpacklo_epi16(b0, b2);
132 a1 = _mm_unpacklo_epi16(b1, b3);
133 a2 = _mm_unpackhi_epi16(b0, b2);
134 a3 = _mm_unpackhi_epi16(b1, b3);
135 a4 = _mm_unpacklo_epi16(b4, b6);
136 a5 = _mm_unpacklo_epi16(b5, b7);
137 a6 = _mm_unpackhi_epi16(b4, b6);
138 a7 = _mm_unpackhi_epi16(b5, b7);
139
140 b0 = _mm_unpacklo_epi16(a0, a1);
141 b1 = _mm_unpackhi_epi16(a0, a1);
142 b2 = _mm_unpacklo_epi16(a2, a3);
143 b3 = _mm_unpackhi_epi16(a2, a3);
144 b4 = _mm_unpacklo_epi16(a4, a5);
145 b5 = _mm_unpackhi_epi16(a4, a5);
146 b6 = _mm_unpacklo_epi16(a6, a7);
147 b7 = _mm_unpackhi_epi16(a6, a7);
148
149 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride), b0);
150 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride), b1);
151 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride), b2);
152 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride), b3);
153 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride), b4);
154 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride), b5);
155 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride), b6);
156 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride), b7);
157 }
158
159 src += 8*src_stride;
160 dst += 8;
161 }
162 while (--numRows);
163 }
164
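// Transpose a 16x16 block while recording which 8x8 sub-blocks are all zero.
// Return flags: bit 0 set when the bottom 8 rows are zero (half-length
// transforms usable in the 1st pass), bit 1 set when the right 8 columns are
// zero (only 8 rows needed in the 1st pass, half-length transforms in the 2nd).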
static EB_U32 transpose16Check0s(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
166 {
167 EB_U32 i, j;
168 EB_U32 zeroPattern = 0;
169 EB_U32 result = 0;
170
171 for (i = 0; i < 2; i++)
172 {
173 for (j = 0; j < 2; j++)
174 {
175 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
176 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
177 __m128i c0;
178
179 a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
180 a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
181 a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
182 a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
183 a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
184 a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
185 a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
186 a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
187
188 c0 = _mm_or_si128(a0, a4);
189 c0 = _mm_or_si128(c0, a1);
190 c0 = _mm_or_si128(c0, a5);
191 c0 = _mm_or_si128(c0, a2);
192 c0 = _mm_or_si128(c0, a6);
193 c0 = _mm_or_si128(c0, a3);
194 c0 = _mm_or_si128(c0, a7);
195
196 c0 = _mm_cmpeq_epi8(c0, _mm_setzero_si128());
197
198 zeroPattern = 2 * zeroPattern + ((_mm_movemask_epi8(c0)+1) >> 16); // add a '1' bit if all zeros
199
200 b0 = _mm_unpacklo_epi16(a0, a4);
201 b1 = _mm_unpacklo_epi16(a1, a5);
202 b2 = _mm_unpacklo_epi16(a2, a6);
203 b3 = _mm_unpacklo_epi16(a3, a7);
204 b4 = _mm_unpackhi_epi16(a0, a4);
205 b5 = _mm_unpackhi_epi16(a1, a5);
206 b6 = _mm_unpackhi_epi16(a2, a6);
207 b7 = _mm_unpackhi_epi16(a3, a7);
208
209 a0 = _mm_unpacklo_epi16(b0, b2);
210 a1 = _mm_unpacklo_epi16(b1, b3);
211 a2 = _mm_unpackhi_epi16(b0, b2);
212 a3 = _mm_unpackhi_epi16(b1, b3);
213 a4 = _mm_unpacklo_epi16(b4, b6);
214 a5 = _mm_unpacklo_epi16(b5, b7);
215 a6 = _mm_unpackhi_epi16(b4, b6);
216 a7 = _mm_unpackhi_epi16(b5, b7);
217
218 b0 = _mm_unpacklo_epi16(a0, a1);
219 b1 = _mm_unpackhi_epi16(a0, a1);
220 b2 = _mm_unpacklo_epi16(a2, a3);
221 b3 = _mm_unpackhi_epi16(a2, a3);
222 b4 = _mm_unpacklo_epi16(a4, a5);
223 b5 = _mm_unpackhi_epi16(a4, a5);
224 b6 = _mm_unpacklo_epi16(a6, a7);
225 b7 = _mm_unpackhi_epi16(a6, a7);
226
227 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
228 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
229 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
230 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
231 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
232 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
233 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
234 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
235 }
236 }
237
238 if ((zeroPattern & 3) == 3) result |= 1; // can do half transforms 1st pass
239 if ((zeroPattern & 5) == 5) result |= 2; // can do half rows 1st pass, and half transforms 2nd pass
240 return result;
241 }
242
243 // 16-point forward transform (16 rows)
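// Each row first goes through a 16-point butterfly: the sums y0 + reverse(y1)
// feed the even-index outputs and the differences feed the odd-index outputs,
// each accumulated with _mm_madd_epi16 against the packed coefficient table,
// then rounded, shifted and packed back to 16 bits.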
static void transform16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 shift)
245 {
246 EB_U32 i;
247 __m128i s0 = _mm_cvtsi32_si128(shift);
248 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
249 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
250
251 for (i = 0; i < 16; i++)
252 {
253 __m128i x0, x1;
254 __m128i y0, y1;
255 __m128i a0, a1, a2, a3;
256 __m128i b0, b1, b2, b3;
257
258 y0 = _mm_loadu_si128((const __m128i *)(src+i*src_stride+0x00));
259 y1 = _mm_loadu_si128((const __m128i *)(src+i*src_stride+0x08));
260
261
262 // 16-point butterfly
263 y1 = reverse_epi16(y1);
264
265 x0 = _mm_add_epi16(y0, y1);
266 x1 = _mm_sub_epi16(y0, y1);
267
268 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
269 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
270 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
271 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
272
273 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
274 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
275 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
276 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
277
278 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
279 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
280 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
281 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
282
283 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
284 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
285 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
286 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
287
288 b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
289 b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
290 b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
291 b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
292
293 x0 = _mm_packs_epi32(b0, b1);
294 x1 = _mm_packs_epi32(b2, b3);
295
296 y0 = _mm_unpacklo_epi16(x0, x1);
297 y1 = _mm_unpackhi_epi16(x0, x1);
298
299 _mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x00), y0);
300 _mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x08), y1);
301 }
302 }
303
304 // 16-point inverse transform
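// Each input row is reordered so that even-index coefficients drive the
// 8-point even part (a0/a1) and odd-index coefficients drive the odd part
// (a2/a3); the final butterfly a0 +/- a2, a1 +/- a3, with the subtracted half
// reversed, reconstructs all 16 samples of the row.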
static void invTransform16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
306 {
307 __m128i s0 = _mm_cvtsi32_si128(shift);
308 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
309 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
310
311 do
312 {
313 __m128i x0, x1;
314 __m128i a0, a1, a2, a3;
315 __m128i b0, b1, b2, b3;
316 x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
317 x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
318
#ifdef SSSE3 // __SSSE3__
320 x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
321 x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
322 #else
323 x0 = _mm_shufflelo_epi16(x0, 0xd8); // 00 02 01 03 04 06 05 07
324 x1 = _mm_shufflelo_epi16(x1, 0xd8); // 08 0a 09 0b 0c 0e 0d 0f
325 x0 = _mm_shufflehi_epi16(x0, 0xd8);
326 x1 = _mm_shufflehi_epi16(x1, 0xd8);
327 #endif
328
329 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 02
330 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[2])); // 04 06
331 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[4])); // 08 0a
332 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[6])); // 0c 0e
333
334 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
335 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[3]));
336 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[5]));
337 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[7]));
338
339 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
340 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[10]));
341 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[12]));
342 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
343
344 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
345 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[11]));
346 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[13]));
347 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
348
349 a0 = _mm_add_epi32(a0, o0);
350 a1 = _mm_add_epi32(a1, o0);
351
352 b0 = _mm_add_epi32(a0, a2);
353 b1 = _mm_add_epi32(a1, a3);
354 b2 = _mm_sub_epi32(a0, a2);
355 b3 = _mm_sub_epi32(a1, a3);
356
357 a0 = b0;
358 a1 = b1;
359 a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
360 a3 = _mm_shuffle_epi32(b2, 0x1b);
361
362 a0 = _mm_sra_epi32(a0, s0);
363 a1 = _mm_sra_epi32(a1, s0);
364 a2 = _mm_sra_epi32(a2, s0);
365 a3 = _mm_sra_epi32(a3, s0);
366
367 x0 = _mm_packs_epi32(a0, a1);
368 x1 = _mm_packs_epi32(a2, a3);
369
370 _mm_storeu_si128((__m128i *)(dst+0x00), x0);
371 _mm_storeu_si128((__m128i *)(dst+0x08), x1);
372
373 src += src_stride;
374 dst += dst_stride;
375 }
376 while (--numRows);
377 }
378
// 16-point inverse transform, reading only the first 8 coefficients of each row (coefficients 8..15 are zero)
static void invTransform16Half(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
381 {
382 __m128i s0 = _mm_cvtsi32_si128(shift);
383 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
384 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
385
386 do
387 {
388 __m128i x0, x1;
389 __m128i a0, a1, a2, a3;
390 __m128i b0, b1, b2, b3;
391 x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
392
#ifdef SSSE3 // __SSSE3__
394 x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
395 #else
396 x0 = _mm_shufflelo_epi16(x0, 0xd8); // 00 02 01 03 04 06 05 07
397 x0 = _mm_shufflehi_epi16(x0, 0xd8);
398 #endif
399
400 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 02
401 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[2])); // 04 06
402
403 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
404 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[3]));
405
406 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
407 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[10]));
408
409 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
410 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[11]));
411
412 a0 = _mm_add_epi32(a0, o0);
413 a1 = _mm_add_epi32(a1, o0);
414
415 b0 = _mm_add_epi32(a0, a2);
416 b1 = _mm_add_epi32(a1, a3);
417 b2 = _mm_sub_epi32(a0, a2);
418 b3 = _mm_sub_epi32(a1, a3);
419
420 a0 = b0;
421 a1 = b1;
422 a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
423 a3 = _mm_shuffle_epi32(b2, 0x1b);
424
425 a0 = _mm_sra_epi32(a0, s0);
426 a1 = _mm_sra_epi32(a1, s0);
427 a2 = _mm_sra_epi32(a2, s0);
428 a3 = _mm_sra_epi32(a3, s0);
429
430 x0 = _mm_packs_epi32(a0, a1);
431 x1 = _mm_packs_epi32(a2, a3);
432
433 _mm_storeu_si128((__m128i *)(dst+0x00), x0);
434 _mm_storeu_si128((__m128i *)(dst+0x08), x1);
435
436 src += src_stride;
437 dst += dst_stride;
438 }
439 while (--numRows);
440 }
441
442
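// Dispatch helper: bit 0 of pattern selects the half-length inverse transform
// (coefficients 8..15 of every row are zero), bit 1 halves the number of rows
// processed (16 - 4*(pattern & 2), i.e. 8 instead of 16).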
static void invTransform16Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 pattern)
444 {
445 EB_U32 numRows = 16 - 4 * (pattern & 2);
446 if (pattern & 1)
447 {
448 invTransform16Half(src, src_stride, dst, dst_stride, shift, numRows);
449 }
450 else
451 {
452 invTransform16(src, src_stride, dst, dst_stride, shift, numRows);
453 }
454 }
455
456 // inverse 16x16 transform
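// Two-pass implementation: the first transpose records which 8x8 quadrants of
// the coefficient block are zero, the column pass then runs with shift 7 and
// the row pass with shift 12 - addshift, skipping work according to 'pattern'.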
void PFinvTransform16x16_SSSE3(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift)
458 {
459
460 EB_U32 pattern = transpose16Check0s(src, src_stride, intermediate, 16);
461 invTransform16Partial(intermediate, 16, dst, dst_stride, 7, pattern);
462
463 pattern >>= 1;
464 transpose16Partial(dst, dst_stride, intermediate, 16, pattern);
465 invTransform16Partial(intermediate, 16, dst, dst_stride, 12-addshift, pattern);
466
467 }
468
469 // transpose 32x32 block of data
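// Same register-level 8x8 sub-block transpose as transpose16, applied to the
// sixteen 8x8 sub-blocks of a 32x32 block.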
static void transpose32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
471 {
472 EB_U32 i, j;
473 for (i = 0; i < 4; i++)
474 {
475 for (j = 0; j < 4; j++)
476 {
477 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
478 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
479
480 a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
481 a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
482 a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
483 a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
484 a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
485 a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
486 a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
487 a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
488
489 b0 = _mm_unpacklo_epi16(a0, a4);
490 b1 = _mm_unpacklo_epi16(a1, a5);
491 b2 = _mm_unpacklo_epi16(a2, a6);
492 b3 = _mm_unpacklo_epi16(a3, a7);
493 b4 = _mm_unpackhi_epi16(a0, a4);
494 b5 = _mm_unpackhi_epi16(a1, a5);
495 b6 = _mm_unpackhi_epi16(a2, a6);
496 b7 = _mm_unpackhi_epi16(a3, a7);
497
498 a0 = _mm_unpacklo_epi16(b0, b2);
499 a1 = _mm_unpacklo_epi16(b1, b3);
500 a2 = _mm_unpackhi_epi16(b0, b2);
501 a3 = _mm_unpackhi_epi16(b1, b3);
502 a4 = _mm_unpacklo_epi16(b4, b6);
503 a5 = _mm_unpacklo_epi16(b5, b7);
504 a6 = _mm_unpackhi_epi16(b4, b6);
505 a7 = _mm_unpackhi_epi16(b5, b7);
506
507 b0 = _mm_unpacklo_epi16(a0, a1);
508 b1 = _mm_unpackhi_epi16(a0, a1);
509 b2 = _mm_unpacklo_epi16(a2, a3);
510 b3 = _mm_unpackhi_epi16(a2, a3);
511 b4 = _mm_unpacklo_epi16(a4, a5);
512 b5 = _mm_unpackhi_epi16(a4, a5);
513 b6 = _mm_unpacklo_epi16(a6, a7);
514 b7 = _mm_unpackhi_epi16(a6, a7);
515
516 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
517 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
518 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
519 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
520 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
521 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
522 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
523 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
524 }
525 }
526 }
527
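// Partial variant of transpose32: only the top 8*numRows source rows are
// transposed (numRows = 4 - (pattern & 3)), so only the corresponding left
// 8-column bands of dst are written.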
static void transpose32Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 pattern)
529 {
530 EB_U32 j;
531 EB_U32 numRows = 4 - (pattern & 3);
532
533 do
534 {
535 for (j = 0; j < 4; j++)
536 {
537 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
538 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
539
540 a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride + 8*j));
541 a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride + 8*j));
542 a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride + 8*j));
543 a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride + 8*j));
544 a4 = _mm_loadu_si128((const __m128i *)(src + (4)*src_stride + 8*j));
545 a5 = _mm_loadu_si128((const __m128i *)(src + (5)*src_stride + 8*j));
546 a6 = _mm_loadu_si128((const __m128i *)(src + (6)*src_stride + 8*j));
547 a7 = _mm_loadu_si128((const __m128i *)(src + (7)*src_stride + 8*j));
548
549 b0 = _mm_unpacklo_epi16(a0, a4);
550 b1 = _mm_unpacklo_epi16(a1, a5);
551 b2 = _mm_unpacklo_epi16(a2, a6);
552 b3 = _mm_unpacklo_epi16(a3, a7);
553 b4 = _mm_unpackhi_epi16(a0, a4);
554 b5 = _mm_unpackhi_epi16(a1, a5);
555 b6 = _mm_unpackhi_epi16(a2, a6);
556 b7 = _mm_unpackhi_epi16(a3, a7);
557
558 a0 = _mm_unpacklo_epi16(b0, b2);
559 a1 = _mm_unpacklo_epi16(b1, b3);
560 a2 = _mm_unpackhi_epi16(b0, b2);
561 a3 = _mm_unpackhi_epi16(b1, b3);
562 a4 = _mm_unpacklo_epi16(b4, b6);
563 a5 = _mm_unpacklo_epi16(b5, b7);
564 a6 = _mm_unpackhi_epi16(b4, b6);
565 a7 = _mm_unpackhi_epi16(b5, b7);
566
567 b0 = _mm_unpacklo_epi16(a0, a1);
568 b1 = _mm_unpackhi_epi16(a0, a1);
569 b2 = _mm_unpacklo_epi16(a2, a3);
570 b3 = _mm_unpackhi_epi16(a2, a3);
571 b4 = _mm_unpacklo_epi16(a4, a5);
572 b5 = _mm_unpackhi_epi16(a4, a5);
573 b6 = _mm_unpacklo_epi16(a6, a7);
574 b7 = _mm_unpackhi_epi16(a6, a7);
575
576 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride), b0);
577 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride), b1);
578 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride), b2);
579 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride), b3);
580 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride), b4);
581 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride), b5);
582 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride), b6);
583 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride), b7);
584 }
585
586 src += 8 * src_stride;
587 dst += 8;
588 }
589 while (--numRows);
590 }
591
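// Transpose a 32x32 block while recording which 8x8 sub-blocks are all zero.
// Return value: bits 1:0 give how many bottom 8-row bands are zero and bits
// 3:2 how many right 8-column bands are zero (0..3 each); the caller can use
// these counts to pick the full, three-quarter or half inverse transforms.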
static EB_U32 transpose32Check0s(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
593 {
594 EB_U32 i, j;
595 EB_U32 zeroPattern = 0;
596 EB_U32 result = 0;
597
598 for (i = 0; i < 4; i++)
599 {
600 for (j = 0; j < 4; j++)
601 {
602 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
603 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
604 __m128i c0;
605
606 a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
607 a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
608 a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
609 a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
610 a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
611 a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
612 a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
613 a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
614
615 c0 = _mm_or_si128(a0, a4);
616 c0 = _mm_or_si128(c0, a1);
617 c0 = _mm_or_si128(c0, a5);
618 c0 = _mm_or_si128(c0, a2);
619 c0 = _mm_or_si128(c0, a6);
620 c0 = _mm_or_si128(c0, a3);
621 c0 = _mm_or_si128(c0, a7);
622
623 c0 = _mm_cmpeq_epi8(c0, _mm_setzero_si128());
624
625 zeroPattern = 2 * zeroPattern + ((_mm_movemask_epi8(c0)+1) >> 16); // add a '1' bit if all zeros
626
627 b0 = _mm_unpacklo_epi16(a0, a4);
628 b1 = _mm_unpacklo_epi16(a1, a5);
629 b2 = _mm_unpacklo_epi16(a2, a6);
630 b3 = _mm_unpacklo_epi16(a3, a7);
631 b4 = _mm_unpackhi_epi16(a0, a4);
632 b5 = _mm_unpackhi_epi16(a1, a5);
633 b6 = _mm_unpackhi_epi16(a2, a6);
634 b7 = _mm_unpackhi_epi16(a3, a7);
635
636 a0 = _mm_unpacklo_epi16(b0, b2);
637 a1 = _mm_unpacklo_epi16(b1, b3);
638 a2 = _mm_unpackhi_epi16(b0, b2);
639 a3 = _mm_unpackhi_epi16(b1, b3);
640 a4 = _mm_unpacklo_epi16(b4, b6);
641 a5 = _mm_unpacklo_epi16(b5, b7);
642 a6 = _mm_unpackhi_epi16(b4, b6);
643 a7 = _mm_unpackhi_epi16(b5, b7);
644
645 b0 = _mm_unpacklo_epi16(a0, a1);
646 b1 = _mm_unpackhi_epi16(a0, a1);
647 b2 = _mm_unpacklo_epi16(a2, a3);
648 b3 = _mm_unpackhi_epi16(a2, a3);
649 b4 = _mm_unpacklo_epi16(a4, a5);
650 b5 = _mm_unpackhi_epi16(a4, a5);
651 b6 = _mm_unpacklo_epi16(a6, a7);
652 b7 = _mm_unpackhi_epi16(a6, a7);
653
654 _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
655 _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
656 _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
657 _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
658 _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
659 _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
660 _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
661 _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
662 }
663 }
664
665 if ((zeroPattern & 0xfff) == 0xfff) result |= 3;
666 else if ((zeroPattern & 0xff) == 0xff) result |= 2;
667 else if ((zeroPattern & 0xf) == 0xf) result |= 1;
668
669 if ((zeroPattern & 0x7777) == 0x7777) result |= 3*4;
670 else if ((zeroPattern & 0x3333) == 0x3333) result |= 2*4;
671 else if ((zeroPattern & 0x1111) == 0x1111) result |= 1*4;
672
673 return result;
674 }
675
676 // 32-point forward transform (32 rows)
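// Each row is folded twice: the 32-point butterfly produces 16 sums (feeding
// the even-index outputs) and 16 differences (feeding the odd-index outputs),
// the sums go through a further 16-point butterfly, and all partial products
// are accumulated with _mm_madd_epi16 against the packed coefficient table.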
static void transform32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift)
678 {
679 __m128i s0 = _mm_cvtsi32_si128(shift);
680 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
681 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
682
683 EB_U32 numRows = 32;
684 do
685 {
686 __m128i x0, x1, x2, x3;
687 __m128i y0, y1, y2, y3;
688 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
689 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
690
691 x0 = _mm_loadu_si128((const __m128i *)(src+0x00));
692 x1 = _mm_loadu_si128((const __m128i *)(src+0x08));
693 x2 = _mm_loadu_si128((const __m128i *)(src+0x10));
694 x3 = _mm_loadu_si128((const __m128i *)(src+0x18));
695
696
697 // 32-point butterfly
698 x2 = reverse_epi16(x2);
699 x3 = reverse_epi16(x3);
700
701 y0 = _mm_add_epi16(x0, x3);
702 y1 = _mm_add_epi16(x1, x2);
703
704 y2 = _mm_sub_epi16(x0, x3);
705 y3 = _mm_sub_epi16(x1, x2);
706
707 // 16-point butterfly
708 y1 = reverse_epi16(y1);
709
710 x0 = _mm_add_epi16(y0, y1);
711 x1 = _mm_sub_epi16(y0, y1);
712
713
714 x2 = y2;
715 x3 = y3;
716
717 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
718 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
719 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
720 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
721
722 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
723 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
724 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
725 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
726
727 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
728 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
729 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
730 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
731
732 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
733 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
734 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
735 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
736
737 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
738 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
739 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
740 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
741 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
742 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
743 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
744 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
745
746 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
747 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
748 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
749 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
750 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
751 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
752 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
753 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
754
755 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
756 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
757 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
758 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
759 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
760 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
761 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
762 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
763
764 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
765 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
766 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
767 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
768 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
769 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
770 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
771 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
772
773 b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
774 b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
775 b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
776 b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
777 b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
778 b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
779 b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
780 b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
781
782 x0 = _mm_packs_epi32(b0, b1);
783 x1 = _mm_packs_epi32(b2, b3);
784 x2 = _mm_packs_epi32(b4, b5);
785 x3 = _mm_packs_epi32(b6, b7);
786
787 y0 = _mm_unpacklo_epi16(x0, x1);
788 y1 = _mm_unpackhi_epi16(x0, x1);
789 y2 = x2;
790 y3 = x3;
791 x0 = _mm_unpacklo_epi16(y0, y2);
792 x1 = _mm_unpackhi_epi16(y0, y2);
793 x2 = _mm_unpacklo_epi16(y1, y3);
794 x3 = _mm_unpackhi_epi16(y1, y3);
795
796 _mm_storeu_si128((__m128i *)(dst+0x00), x0);
797 _mm_storeu_si128((__m128i *)(dst+0x08), x1);
798 _mm_storeu_si128((__m128i *)(dst+0x10), x2);
799 _mm_storeu_si128((__m128i *)(dst+0x18), x3);
800
801 src += src_stride;
802 dst += dst_stride;
803 }
804 while (--numRows);
805 }
806
807 // 32-point inverse transform (32 rows)
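// Row coefficients are regrouped (indices 0 mod 4, 2 mod 4 and odd) and
// multiplied against the packed coefficient table; a butterfly over a0..a3
// followed by one against a4..a7, with the subtracted halves reversed via
// _mm_shuffle_epi32(..., 0x1b), rebuilds the 32 output samples.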
static void invTransform32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
809 {
810 __m128i s0 = _mm_cvtsi32_si128(shift);
811 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
812 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
813
814 do
815 {
816 __m128i x0, x1, x2, x3;
#ifndef SSSE3 // __SSSE3__
818 __m128i y0, y1, y2, y3;
819 #endif
820 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
821 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
822 x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
823 x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
824 x2 = _mm_loadu_si128((const __m128i *)(src+0x10)); // 10 11 12 13 14 15 16 17
825 x3 = _mm_loadu_si128((const __m128i *)(src+0x18)); // 18 19 1a 1b 1c 1d 1e 1f
826
#ifdef SSSE3 // __SSSE3__
828 x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
829 x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
830 x2 = _mm_shuffle_epi8(x2, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 10 14 12 16 11 13 15 17
831 x3 = _mm_shuffle_epi8(x3, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 18 1c 1a 1e 19 1b 1d 1f
832
833 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
834 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
835 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[4]));
836 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[6]));
837
838 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
839 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
840 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[5]));
841 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[7]));
842
843 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
844 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
845 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[12]));
846 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[14]));
847
848 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
849 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
850 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[13]));
851 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[15]));
852
853 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
854 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
855 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
856 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
857 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[32]));
858 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[36]));
859 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
860 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
861
862 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
863 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
864 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
865 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
866 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[33]));
867 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[37]));
868 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
869 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
870
871 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
872 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
873 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
874 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
875 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[34]));
876 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[38]));
877 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
878 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
879
880 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
881 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
882 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
883 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
884 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[35]));
885 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[39]));
886 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
887 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
888
889 #else
890 y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
891 y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
892 y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c
894
895 x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
896 x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
897 x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
898 x3 = _mm_unpackhi_epi16(y2, y3); // 12 16
899
900 y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
901 y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
902 y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
903 y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f
904
905 x0 = y0;
906 x1 = y1;
907 x2 = y2;
908 x3 = y3;
909
910 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
911 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c
912 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4])); // 10 14
913 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6])); // 18 1c
914
915 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
916 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
917 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
918 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
919
920 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
921 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e
922 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12])); // 12 16
923 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14])); // 1a 1e
924
925 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
926 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
927 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
928 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
929
930 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
931 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
932 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
933 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f
934 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32])); // 11 13
935 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36])); // 15 17
936 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40])); // 19 1b
937 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44])); // 1d 1f
938
939 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
940 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
941 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
942 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
943 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
944 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
945 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
946 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
947
948 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
949 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
950 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
951 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
952 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
953 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
954 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
955 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
956
957 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
958 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
959 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
960 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
961 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
962 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
963 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
964 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
965 #endif
966
967 a0 = _mm_add_epi32(a0, o0);
968 a1 = _mm_add_epi32(a1, o0);
969
970 b0 = _mm_add_epi32(a0, a2);
971 b1 = _mm_add_epi32(a1, a3);
972 b2 = _mm_sub_epi32(a0, a2);
973 b3 = _mm_sub_epi32(a1, a3);
974
975 a0 = b0;
976 a1 = b1;
977 a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
978 a3 = _mm_shuffle_epi32(b2, 0x1b);
979
980 b0 = _mm_add_epi32(a0, a4);
981 b1 = _mm_add_epi32(a1, a5);
982 b2 = _mm_add_epi32(a2, a6);
983 b3 = _mm_add_epi32(a3, a7);
984 b4 = _mm_sub_epi32(a0, a4);
985 b5 = _mm_sub_epi32(a1, a5);
986 b6 = _mm_sub_epi32(a2, a6);
987 b7 = _mm_sub_epi32(a3, a7);
988
989 a0 = _mm_sra_epi32(b0, s0);
990 a1 = _mm_sra_epi32(b1, s0);
991 a2 = _mm_sra_epi32(b2, s0);
992 a3 = _mm_sra_epi32(b3, s0);
993 a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
994 a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
995 a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
996 a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
997
998 x0 = _mm_packs_epi32(a0, a1);
999 x1 = _mm_packs_epi32(a2, a3);
1000 x2 = _mm_packs_epi32(a4, a5);
1001 x3 = _mm_packs_epi32(a6, a7);
1002
1003 _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1004 _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1005 _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1006 _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1007
1008 src += src_stride;
1009 dst += dst_stride;
1010 }
1011 while (--numRows);
1012 }
1013
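// 32-point inverse transform assuming coefficients 24..31 of every row are
// zero: only the first 24 are loaded and the corresponding products dropped.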
static void invTransform32ThreeQuarter(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
1015 {
1016 __m128i s0 = _mm_cvtsi32_si128(shift);
1017 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1018 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1019
1020 do
1021 {
1022 __m128i x0, x1, x2, x3;
#ifndef SSSE3 // __SSSE3__
1024 __m128i y0, y1, y2, y3;
1025 #endif
1026 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1027 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1028 x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
1029 x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1030 x2 = _mm_loadu_si128((const __m128i *)(src+0x10)); // 10 11 12 13 14 15 16 17
1031 x3 = _mm_setzero_si128();
1032
#ifdef SSSE3 // __SSSE3__
1034 x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
1035 x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
1036 x2 = _mm_shuffle_epi8(x2, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 10 14 12 16 11 13 15 17
1037
1038 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1039 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
1040 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[4]));
1041
1042 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1043 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
1044 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[5]));
1045
1046 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
1047 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1048 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[12]));
1049
1050 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
1051 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1052 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[13]));
1053
1054 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
1055 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
1056 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
1057 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
1058 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[32]));
1059 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[36]));
1060
1061 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
1062 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
1063 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
1064 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
1065 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[33]));
1066 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[37]));
1067
1068 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
1069 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
1070 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
1071 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
1072 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[34]));
1073 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[38]));
1074
1075 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
1076 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
1077 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
1078 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
1079 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[35]));
1080 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[39]));
1081
1082 #else
1083 y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1084 y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1085 y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c
1087
1088 x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1089 x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1090 x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
1091 x3 = _mm_unpackhi_epi16(y2, y3); // 12 16
1092
1093 y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
1094 y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
1095 y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1096 y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f
1097
1098 x0 = y0;
1099 x1 = y1;
1100 x2 = y2;
1101 x3 = y3;
1102
1103 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
1104 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c
1105 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4])); // 10 14
1106
1107 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1108 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1109 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1110
1111 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
1112 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e
1113 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12])); // 12 16
1114
1115 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1116 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1117 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1118
1119 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
1120 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
1121 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
1122 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f
1123 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32])); // 11 13
1124 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36])); // 15 17
1125
1126 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1127 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1128 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1129 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1130 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1131 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1132
1133 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1134 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1135 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1136 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1137 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1138 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1139
1140 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1141 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1142 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1143 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1144 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1145 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1146 #endif
1147
1148 a0 = _mm_add_epi32(a0, o0);
1149 a1 = _mm_add_epi32(a1, o0);
1150
1151 b0 = _mm_add_epi32(a0, a2);
1152 b1 = _mm_add_epi32(a1, a3);
1153 b2 = _mm_sub_epi32(a0, a2);
1154 b3 = _mm_sub_epi32(a1, a3);
1155
1156 a0 = b0;
1157 a1 = b1;
1158 a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1159 a3 = _mm_shuffle_epi32(b2, 0x1b);
1160
1161 b0 = _mm_add_epi32(a0, a4);
1162 b1 = _mm_add_epi32(a1, a5);
1163 b2 = _mm_add_epi32(a2, a6);
1164 b3 = _mm_add_epi32(a3, a7);
1165 b4 = _mm_sub_epi32(a0, a4);
1166 b5 = _mm_sub_epi32(a1, a5);
1167 b6 = _mm_sub_epi32(a2, a6);
1168 b7 = _mm_sub_epi32(a3, a7);
1169
1170 a0 = _mm_sra_epi32(b0, s0);
1171 a1 = _mm_sra_epi32(b1, s0);
1172 a2 = _mm_sra_epi32(b2, s0);
1173 a3 = _mm_sra_epi32(b3, s0);
1174 a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1175 a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1176 a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1177 a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1178
1179 x0 = _mm_packs_epi32(a0, a1);
1180 x1 = _mm_packs_epi32(a2, a3);
1181 x2 = _mm_packs_epi32(a4, a5);
1182 x3 = _mm_packs_epi32(a6, a7);
1183
1184 _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1185 _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1186 _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1187 _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1188
1189 src += src_stride;
1190 dst += dst_stride;
1191 }
1192 while (--numRows);
1193 }
1194
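// 32-point inverse transform assuming coefficients 16..31 of every row are
// zero: only the first 16 are loaded.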
static void invTransform32Half(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
1196 {
1197 __m128i s0 = _mm_cvtsi32_si128(shift);
1198 __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1199 const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1200
1201 do
1202 {
1203 __m128i x0, x1, x2, x3;
#ifndef SSSE3 // __SSSE3__
1205 __m128i y0, y1, y2;
1206 #endif
1207 __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1208 __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1209 x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
1210 x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1211 x2 = _mm_setzero_si128();
1212 x3 = _mm_setzero_si128();
1213
#ifdef SSSE3 // __SSSE3__
1215 x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
1216 x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
1217
1218 a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1219 a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
1220
1221 a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1222 a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
1223
1224 a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
1225 a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1226
1227 a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
1228 a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1229
1230 a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
1231 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
1232 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
1233 a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
1234
1235 a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
1236 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
1237 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
1238 a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
1239
1240 a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
1241 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
1242 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
1243 a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
1244
1245 a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
1246 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
1247 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
1248 a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
1249
1250 #else
1251 y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1252 y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1253
1254 x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1255 x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1256
        y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 00 00 00 00 (x2 is zero in the half variant)
        y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 00 00 00 00 (x3 is zero in the half variant)
        y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f

        x0 = y0;
        x1 = y1;
        x2 = y2;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
        a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
        a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
        a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
        a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
#endif

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        b0 = _mm_add_epi32(a0, a4);
        b1 = _mm_add_epi32(a1, a5);
        b2 = _mm_add_epi32(a2, a6);
        b3 = _mm_add_epi32(a3, a7);
        b4 = _mm_sub_epi32(a0, a4);
        b5 = _mm_sub_epi32(a1, a5);
        b6 = _mm_sub_epi32(a2, a6);
        b7 = _mm_sub_epi32(a3, a7);

        a0 = _mm_sra_epi32(b0, s0);
        a1 = _mm_sra_epi32(b1, s0);
        a2 = _mm_sra_epi32(b2, s0);
        a3 = _mm_sra_epi32(b3, s0);
        a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
        a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
        a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
        a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);
        x2 = _mm_packs_epi32(a4, a5);
        x3 = _mm_packs_epi32(a6, a7);

        _mm_storeu_si128((__m128i *)(dst+0x00), x0);
        _mm_storeu_si128((__m128i *)(dst+0x08), x1);
        _mm_storeu_si128((__m128i *)(dst+0x10), x2);
        _mm_storeu_si128((__m128i *)(dst+0x18), x3);

        src += src_stride;
        dst += dst_stride;
    }
    while (--numRows);
}
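
/*
 * Editor's note: the "half" variant above loads only coefficients 0..15 of
 * each input row (x2/x3 stay zero in the non-SSSE3 path), i.e. it relies on
 * the caller having determined that src[16..31] == 0 for every processed row.
 * Every product involving those coefficients drops out of
 *     out[n]      = even[n] + odd[n]
 *     out[31 - n] = even[n] - odd[n]
 * so each output needs roughly half the _mm_madd_epi16 work of the full
 * 32-coefficient kernel; the quarter variant below pushes the same idea to
 * src[8..31] == 0.
 */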

static void invTransform32Quarter(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
{
    __m128i s0 = _mm_cvtsi32_si128(shift);
    __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
    const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;

    do
    {
        __m128i x0, x1, x2, x3;
#ifndef SSSE3 // __SSSE3__
        __m128i y0, y1;
#endif
        __m128i a0, a1, a2, a3, a4, a5, a6, a7;
        __m128i b0, b1, b2, b3, b4, b5, b6, b7;
        x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07

#ifdef SSSE3 // __SSSE3__
        x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
#else
        y0 = _mm_unpacklo_epi16(x0, x0); // 00 00 01 01 02 02 03 03
        y1 = _mm_unpackhi_epi16(x0, x0); // 04 04 05 05 06 06 07 07

        x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 00 04 01 05 01 05
        x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 02 06 03 07 03 07

        y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 00 04 00 04 00 04
        y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 02 06 02 06 02 06
        x2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 01 03 05 07

        x0 = y0;
        x1 = y1;

        a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04

        a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);

        a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06

        a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);

        a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
        a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07

        a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
        a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));

        a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
        a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));

        a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
        a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
#endif

        a0 = _mm_add_epi32(a0, o0);
        a1 = _mm_add_epi32(a1, o0);

        b0 = _mm_add_epi32(a0, a2);
        b1 = _mm_add_epi32(a1, a3);
        b2 = _mm_sub_epi32(a0, a2);
        b3 = _mm_sub_epi32(a1, a3);

        a0 = b0;
        a1 = b1;
        a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
        a3 = _mm_shuffle_epi32(b2, 0x1b);

        b0 = _mm_add_epi32(a0, a4);
        b1 = _mm_add_epi32(a1, a5);
        b2 = _mm_add_epi32(a2, a6);
        b3 = _mm_add_epi32(a3, a7);
        b4 = _mm_sub_epi32(a0, a4);
        b5 = _mm_sub_epi32(a1, a5);
        b6 = _mm_sub_epi32(a2, a6);
        b7 = _mm_sub_epi32(a3, a7);

        a0 = _mm_sra_epi32(b0, s0);
        a1 = _mm_sra_epi32(b1, s0);
        a2 = _mm_sra_epi32(b2, s0);
        a3 = _mm_sra_epi32(b3, s0);
        a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
        a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
        a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
        a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);

        x0 = _mm_packs_epi32(a0, a1);
        x1 = _mm_packs_epi32(a2, a3);
        x2 = _mm_packs_epi32(a4, a5);
        x3 = _mm_packs_epi32(a6, a7);

        _mm_storeu_si128((__m128i *)(dst+0x00), x0);
        _mm_storeu_si128((__m128i *)(dst+0x08), x1);
        _mm_storeu_si128((__m128i *)(dst+0x10), x2);
        _mm_storeu_si128((__m128i *)(dst+0x18), x3);

        src += src_stride;
        dst += dst_stride;
    }
    while (--numRows);
}

static void invTransform32Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 pattern)
{
    EB_U32 numRows = 32 - 2 * (pattern & 12);

    switch (pattern & 3)
    {
    case 3:
        invTransform32Quarter(src, src_stride, dst, dst_stride, shift, numRows);
        break;
    case 2:
        invTransform32Half(src, src_stride, dst, dst_stride, shift, numRows);
        break;
    case 1:
        invTransform32ThreeQuarter(src, src_stride, dst, dst_stride, shift, numRows);
        break;
    default:
        invTransform32(src, src_stride, dst, dst_stride, shift, numRows);
        break;
    }
}
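
/*
 * Editor's note -- worked example of the pattern decoding above, with values
 * taken straight from the code: for pattern = 0x06 (binary 0110),
 *     pattern & 12 = 4  ->  numRows = 32 - 2*4 = 24 rows are processed,
 *     pattern & 3  = 2  ->  invTransform32Half is used, i.e. each processed
 *                           row assumes its upper 16 coefficients are zero.
 * pattern itself comes from transpose32Check0s()/transpose32Partial(), which,
 * judging by their names and their use below, flag all-zero regions of the
 * coefficient block.
 */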

// inverse 32x32 transform
void PFinvTransform32x32_SSSE3(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift)
{
    EB_U32 pattern = transpose32Check0s(src, src_stride, intermediate, 32);
    invTransform32Partial(intermediate, 32, dst, dst_stride, 7, pattern);

    pattern >>= 2;
    transpose32Partial(dst, dst_stride, intermediate, 32, pattern);
    invTransform32Partial(intermediate, 32, dst, dst_stride, 12-addshift, pattern);
}
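
/*
 * Editor's sketch -- hypothetical call of the entry point above, not taken
 * from this library's headers.  PFinvTransform32x32_SSSE3 runs two 1-D passes
 * separated by transposes (shift 7, then 12 - addshift); the reading of
 * addshift as the bit-depth increment (bitDepth - 8, which would make the
 * second shift the usual HEVC 20 - bitDepth) is the editor's assumption.
 */
#if 0 /* illustrative only, excluded from the build */
{
    EB_S16 coeffBlock[32 * 32]; /* dequantized coefficients, stride 32 (filled elsewhere) */
    EB_S16 residual[32 * 32];   /* reconstructed residual, stride 32                      */
    EB_S16 scratch[32 * 32];    /* intermediate buffer the kernel transposes into         */

    /* 8-bit content: addshift = 0 under the assumption above */
    PFinvTransform32x32_SSSE3(coeffBlock, 32, residual, 32, scratch, 0);
}
#endif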

void QuantizeInvQuantize4x4_SSE3(
    EB_S16 *coeff,
    const EB_U32 coeffStride,
    EB_S16 *quantCoeff,
    EB_S16 *reconCoeff,
    const EB_U32 qFunc,
    const EB_U32 q_offset,
    const EB_S32 shiftedQBits,
    const EB_S32 shiftedFFunc,
    const EB_S32 iq_offset,
    const EB_S32 shiftNum,
    const EB_U32 areaSize,
    EB_U32 *nonzerocoeff)
{
    int row;

    __m128i q = _mm_set1_epi16((EB_S16)qFunc);
    __m128i o = _mm_set1_epi32(q_offset);
    __m128i s = _mm_cvtsi32_si128(shiftedQBits);

    __m128i iq = _mm_set1_epi16((EB_S16)shiftedFFunc);
    __m128i io = _mm_set1_epi32(iq_offset);
    __m128i is = _mm_cvtsi32_si128(shiftNum);

    __m128i z = _mm_setzero_si128();

    (void)areaSize;

    row = 0;
    do
    {
        __m128i a0, a1;
        __m128i b0, b1;
        __m128i x0 = _mm_loadl_epi64((__m128i *)(coeff + coeffStride*row + 0));
        __m128i x1 = _mm_loadl_epi64((__m128i *)(coeff + coeffStride*row + coeffStride));
        __m128i y = _mm_unpacklo_epi64(x0, x1);
        __m128i x;

        x = _mm_abs_epi16(y);

        a0 = _mm_mullo_epi16(x, q);
        a1 = _mm_mulhi_epi16(x, q);

        b0 = _mm_unpacklo_epi16(a0, a1);
        b1 = _mm_unpackhi_epi16(a0, a1);

        b0 = _mm_add_epi32(b0, o);
        b1 = _mm_add_epi32(b1, o);

        b0 = _mm_sra_epi32(b0, s);
        b1 = _mm_sra_epi32(b1, s);

        x = _mm_packs_epi32(b0, b1);

        z = _mm_sub_epi16(z, _mm_cmpgt_epi16(x, _mm_setzero_si128())); // count nonzero levels: cmpgt yields -1 per lane with x > 0

        x = _mm_sign_epi16(x, y);
        _mm_storel_epi64((__m128i *)(quantCoeff + coeffStride*row + 0), x);
        _mm_storel_epi64((__m128i *)(quantCoeff + coeffStride*row + coeffStride), _mm_srli_si128(x, 8));

        __m128i zer = _mm_setzero_si128();
        __m128i cmp = _mm_cmpeq_epi16(x, zer);
        int msk = _mm_movemask_epi8(cmp);

        if (msk != 0xFFFF)
        {
            a0 = _mm_mullo_epi16(x, iq);
            a1 = _mm_mulhi_epi16(x, iq);

            b0 = _mm_unpacklo_epi16(a0, a1);
            b1 = _mm_unpackhi_epi16(a0, a1);

            b0 = _mm_add_epi32(b0, io);
            b1 = _mm_add_epi32(b1, io);

            b0 = _mm_sra_epi32(b0, is);
            b1 = _mm_sra_epi32(b1, is);

            x = _mm_packs_epi32(b0, b1);
            _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + 0), x);
            _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + coeffStride), _mm_srli_si128(x, 8));
        }
        else {
            _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + 0), zer);
            _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + coeffStride), zer);
        }
        row += 2;
    } while (row < 4);

    z = _mm_sad_epu8(z, _mm_srli_si128(z, 7)); // horizontal sum of the eight 16-bit lane counters (their high bytes are zero)
    *nonzerocoeff = _mm_cvtsi128_si32(z);
}
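
/*
 * Editor's sketch -- scalar restatement of the per-coefficient arithmetic the
 * SIMD routine above performs, for reference only (the helper name is
 * hypothetical and the function is not part of this library).
 */
#if 0 /* illustrative only, excluded from the build */
static void QuantizeInvQuantizeCoeffSketch(
    EB_S16  coeff,
    EB_U32  qFunc, EB_U32 q_offset, EB_S32 shiftedQBits,     /* forward quantization */
    EB_S32  shiftedFFunc, EB_S32 iq_offset, EB_S32 shiftNum, /* inverse quantization */
    EB_S16 *quantCoeff, EB_S16 *reconCoeff, EB_U32 *nonzerocoeff)
{
    /* forward quantization of the magnitude (mirrors _mm_abs_epi16) */
    EB_S32 mag   = coeff < 0 ? -coeff : coeff;
    EB_S32 level = (mag * (EB_S32)(EB_S16)qFunc + (EB_S32)q_offset) >> shiftedQBits;
    EB_S32 q, recon;
    if (level > 32767) level = 32767;      /* _mm_packs_epi32 saturation */
    *nonzerocoeff += (level != 0);         /* what the z accumulator counts */

    /* restore the sign (mirrors _mm_sign_epi16: a zero input stays zero) */
    q = (coeff > 0) ? level : ((coeff < 0) ? -level : 0);
    *quantCoeff = (EB_S16)q;

    /* inverse quantization of the signed level; the SIMD code skips this
       and stores zeros when a whole group of eight levels is zero */
    recon = (q * (EB_S32)(EB_S16)shiftedFFunc + iq_offset) >> shiftNum;
    if (recon >  32767) recon =  32767;    /* saturation as in the final pack */
    if (recon < -32768) recon = -32768;
    *reconCoeff = (EB_S16)recon;
}
#endif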