1 /*
2 * Copyright(c) 2018 Intel Corporation
3 * SPDX-License-Identifier: BSD-2-Clause-Patent
4 */
5 
6 #include "EbDefinitions.h"
7 
8 #include "EbTransforms_SSSE3.h"
9 
10 #include <emmintrin.h>
11 #include <tmmintrin.h>
12 
13 
14 #define SSSE3 // __SSSE3__
15 
16 
17 
18 #ifdef __cplusplus
19 extern "C" const EB_S16 EbHevcCoeff_tbl[48*8];
20 extern "C" const EB_S16 EbHevcCoeff_tbl2[48*8];
21 #else
22 extern const EB_S16 EbHevcCoeff_tbl[48*8];
23 extern const EB_S16 EbHevcCoeff_tbl2[48*8];
24 #endif
25 
26 
27 // Reverse order of 16-bit elements within 128-bit vector
28 // This can be done more efficiently with _mm_shuffle_epi8 but requires SSSE3
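// The SSE2 fallback below reverses the four 32-bit lanes first and then swaps the two
// 16-bit values inside each lane, giving the same full reversal in three shuffles.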
29 static __m128i reverse_epi16(__m128i x)
30 {
31 #ifdef SSSE3 // __SSSE3__
32   return _mm_shuffle_epi8(x, _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1));
33 #else
34   x = _mm_shuffle_epi32(x, 0x1b); // 00011011
35   x = _mm_shufflelo_epi16(x, 0xb1); // 10110001
36   x = _mm_shufflehi_epi16(x, 0xb1);
37   return x;
38 #endif
39 }
40 
41 
42 // transpose 16x16 block of data
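// The 16x16 block is processed as four 8x8 tiles; each tile is transposed in registers
// with three rounds of 16-bit unpack (interleave) operations and stored to the mirrored tile position.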
43 static void transpose16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
44 {
45   EB_U32 i, j;
46   for (i = 0; i < 2; i++)
47   {
48     for (j = 0; j < 2; j++)
49     {
50       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
51       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
52 
53       a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
54       a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
55       a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
56       a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
57       a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
58       a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
59       a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
60       a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
61 
62       b0 = _mm_unpacklo_epi16(a0, a4);
63       b1 = _mm_unpacklo_epi16(a1, a5);
64       b2 = _mm_unpacklo_epi16(a2, a6);
65       b3 = _mm_unpacklo_epi16(a3, a7);
66       b4 = _mm_unpackhi_epi16(a0, a4);
67       b5 = _mm_unpackhi_epi16(a1, a5);
68       b6 = _mm_unpackhi_epi16(a2, a6);
69       b7 = _mm_unpackhi_epi16(a3, a7);
70 
71       a0 = _mm_unpacklo_epi16(b0, b2);
72       a1 = _mm_unpacklo_epi16(b1, b3);
73       a2 = _mm_unpackhi_epi16(b0, b2);
74       a3 = _mm_unpackhi_epi16(b1, b3);
75       a4 = _mm_unpacklo_epi16(b4, b6);
76       a5 = _mm_unpacklo_epi16(b5, b7);
77       a6 = _mm_unpackhi_epi16(b4, b6);
78       a7 = _mm_unpackhi_epi16(b5, b7);
79 
80       b0 = _mm_unpacklo_epi16(a0, a1);
81       b1 = _mm_unpackhi_epi16(a0, a1);
82       b2 = _mm_unpacklo_epi16(a2, a3);
83       b3 = _mm_unpackhi_epi16(a2, a3);
84       b4 = _mm_unpacklo_epi16(a4, a5);
85       b5 = _mm_unpackhi_epi16(a4, a5);
86       b6 = _mm_unpacklo_epi16(a6, a7);
87       b7 = _mm_unpackhi_epi16(a6, a7);
88 
89       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
90       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
91       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
92       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
93       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
94       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
95       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
96       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
97     }
98   }
99 }
100 
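// Partial transpose: pattern bit 0 halves the number of 8-row bands that are transposed;
// the skipped bands are the ones the zero check found to be empty.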
101 static void transpose16Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 pattern)
102 {
103   EB_U32 j;
104   EB_U32 numRows = 2 - (pattern & 1);
105 
106   do
107   {
108     for (j = 0; j < 2; j++)
109     {
110       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
111       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
112 
113       a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride + 8*j));
114       a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride + 8*j));
115       a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride + 8*j));
116       a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride + 8*j));
117       a4 = _mm_loadu_si128((const __m128i *)(src + (4)*src_stride + 8*j));
118       a5 = _mm_loadu_si128((const __m128i *)(src + (5)*src_stride + 8*j));
119       a6 = _mm_loadu_si128((const __m128i *)(src + (6)*src_stride + 8*j));
120       a7 = _mm_loadu_si128((const __m128i *)(src + (7)*src_stride + 8*j));
121 
122       b0 = _mm_unpacklo_epi16(a0, a4);
123       b1 = _mm_unpacklo_epi16(a1, a5);
124       b2 = _mm_unpacklo_epi16(a2, a6);
125       b3 = _mm_unpacklo_epi16(a3, a7);
126       b4 = _mm_unpackhi_epi16(a0, a4);
127       b5 = _mm_unpackhi_epi16(a1, a5);
128       b6 = _mm_unpackhi_epi16(a2, a6);
129       b7 = _mm_unpackhi_epi16(a3, a7);
130 
131       a0 = _mm_unpacklo_epi16(b0, b2);
132       a1 = _mm_unpacklo_epi16(b1, b3);
133       a2 = _mm_unpackhi_epi16(b0, b2);
134       a3 = _mm_unpackhi_epi16(b1, b3);
135       a4 = _mm_unpacklo_epi16(b4, b6);
136       a5 = _mm_unpacklo_epi16(b5, b7);
137       a6 = _mm_unpackhi_epi16(b4, b6);
138       a7 = _mm_unpackhi_epi16(b5, b7);
139 
140       b0 = _mm_unpacklo_epi16(a0, a1);
141       b1 = _mm_unpackhi_epi16(a0, a1);
142       b2 = _mm_unpacklo_epi16(a2, a3);
143       b3 = _mm_unpackhi_epi16(a2, a3);
144       b4 = _mm_unpacklo_epi16(a4, a5);
145       b5 = _mm_unpackhi_epi16(a4, a5);
146       b6 = _mm_unpacklo_epi16(a6, a7);
147       b7 = _mm_unpackhi_epi16(a6, a7);
148 
149       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride), b0);
150       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride), b1);
151       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride), b2);
152       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride), b3);
153       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride), b4);
154       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride), b5);
155       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride), b6);
156       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride), b7);
157     }
158 
159     src += 8*src_stride;
160     dst += 8;
161   }
162   while (--numRows);
163 }
164 
165 static EB_U32 transpose16Check0s(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
166 {
167   EB_U32 i, j;
168   EB_U32 zeroPattern = 0;
169   EB_U32 result = 0;
170 
171   for (i = 0; i < 2; i++)
172   {
173     for (j = 0; j < 2; j++)
174     {
175       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
176       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
177       __m128i c0;
178 
179       a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
180       a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
181       a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
182       a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
183       a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
184       a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
185       a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
186       a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
187 
188       c0 = _mm_or_si128(a0, a4);
189       c0 = _mm_or_si128(c0, a1);
190       c0 = _mm_or_si128(c0, a5);
191       c0 = _mm_or_si128(c0, a2);
192       c0 = _mm_or_si128(c0, a6);
193       c0 = _mm_or_si128(c0, a3);
194       c0 = _mm_or_si128(c0, a7);
195 
196       c0 = _mm_cmpeq_epi8(c0, _mm_setzero_si128());
197 
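      // _mm_movemask_epi8 returns 0xFFFF only when every byte of c0 is 0xFF (i.e. the whole
      // 8x8 tile was zero); adding 1 and shifting right by 16 turns that into a single 0/1 flag.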
198       zeroPattern = 2 * zeroPattern + ((_mm_movemask_epi8(c0)+1) >> 16); // add a '1' bit if all zeros
199 
200       b0 = _mm_unpacklo_epi16(a0, a4);
201       b1 = _mm_unpacklo_epi16(a1, a5);
202       b2 = _mm_unpacklo_epi16(a2, a6);
203       b3 = _mm_unpacklo_epi16(a3, a7);
204       b4 = _mm_unpackhi_epi16(a0, a4);
205       b5 = _mm_unpackhi_epi16(a1, a5);
206       b6 = _mm_unpackhi_epi16(a2, a6);
207       b7 = _mm_unpackhi_epi16(a3, a7);
208 
209       a0 = _mm_unpacklo_epi16(b0, b2);
210       a1 = _mm_unpacklo_epi16(b1, b3);
211       a2 = _mm_unpackhi_epi16(b0, b2);
212       a3 = _mm_unpackhi_epi16(b1, b3);
213       a4 = _mm_unpacklo_epi16(b4, b6);
214       a5 = _mm_unpacklo_epi16(b5, b7);
215       a6 = _mm_unpackhi_epi16(b4, b6);
216       a7 = _mm_unpackhi_epi16(b5, b7);
217 
218       b0 = _mm_unpacklo_epi16(a0, a1);
219       b1 = _mm_unpackhi_epi16(a0, a1);
220       b2 = _mm_unpacklo_epi16(a2, a3);
221       b3 = _mm_unpackhi_epi16(a2, a3);
222       b4 = _mm_unpacklo_epi16(a4, a5);
223       b5 = _mm_unpackhi_epi16(a4, a5);
224       b6 = _mm_unpacklo_epi16(a6, a7);
225       b7 = _mm_unpackhi_epi16(a6, a7);
226 
227       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
228       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
229       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
230       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
231       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
232       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
233       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
234       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
235     }
236   }
237 
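  // zeroPattern holds one bit per 8x8 quadrant, most significant first: top-left, top-right,
  // bottom-left, bottom-right; a set bit means that quadrant is entirely zero.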
238   if ((zeroPattern & 3) == 3) result |= 1; // can do half transforms 1st pass
239   if ((zeroPattern & 5) == 5) result |= 2; // can do half rows 1st pass, and half transforms 2nd pass
240   return result;
241 }
242 
243 // 16-point forward transform (16 rows)
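// Each row is split by a butterfly into an even part (y0 + reversed y1) and an odd part (y0 - reversed y1);
// _mm_madd_epi16 against EbHevcCoeff_tbl then accumulates four 32-bit outputs at a time,
// which are rounded, shifted and re-interleaved into natural order.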
244 static void transform16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 shift)
245 {
246   EB_U32 i;
247   __m128i s0 = _mm_cvtsi32_si128(shift);
248   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
249   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
250 
251   for (i = 0; i < 16; i++)
252   {
253     __m128i x0, x1;
254     __m128i y0, y1;
255     __m128i a0, a1, a2, a3;
256     __m128i b0, b1, b2, b3;
257 
258     y0 = _mm_loadu_si128((const __m128i *)(src+i*src_stride+0x00));
259     y1 = _mm_loadu_si128((const __m128i *)(src+i*src_stride+0x08));
260 
261 
262     // 16-point butterfly
263     y1 = reverse_epi16(y1);
264 
265     x0 = _mm_add_epi16(y0, y1);
266     x1 = _mm_sub_epi16(y0, y1);
267 
268     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
269     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
270     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
271     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
272 
273     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
274     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
275     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
276     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
277 
278     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
279     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
280     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
281     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
282 
283     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
284     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
285     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
286     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
287 
288     b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
289     b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
290     b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
291     b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
292 
293     x0 = _mm_packs_epi32(b0, b1);
294     x1 = _mm_packs_epi32(b2, b3);
295 
296     y0 = _mm_unpacklo_epi16(x0, x1);
297     y1 = _mm_unpackhi_epi16(x0, x1);
298 
299     _mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x00), y0);
300     _mm_storeu_si128((__m128i *)(dst+i*dst_stride+0x08), y1);
301   }
302 }
303 
304 // 16-point inverse transform
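// a0/a1 accumulate the contribution of the even-index coefficients and a2/a3 the odd-index ones;
// the closing add/sub pair is the inverse butterfly, so outputs 0-7 come from the sums and
// outputs 8-15 from the reversed differences.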
305 static void invTransform16(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
306 {
307   __m128i s0 = _mm_cvtsi32_si128(shift);
308   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
309   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
310 
311   do
312   {
313     __m128i x0, x1;
314     __m128i a0, a1, a2, a3;
315     __m128i b0, b1, b2, b3;
316     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
317     x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
318 
319 #ifdef SSSE3 // __SSSE3__
320     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
321     x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
322 #else
323     x0 = _mm_shufflelo_epi16(x0, 0xd8); // 00 02 01 03 04 06 05 07
324     x1 = _mm_shufflelo_epi16(x1, 0xd8); // 08 0a 09 0b 0c 0e 0d 0f
325     x0 = _mm_shufflehi_epi16(x0, 0xd8);
326     x1 = _mm_shufflehi_epi16(x1, 0xd8);
327 #endif
328 
329     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 02
330     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[2])); // 04 06
331     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[4])); // 08 0a
332     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[6])); // 0c 0e
333 
334     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
335     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[3]));
336     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[5]));
337     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[7]));
338 
339     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
340     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[10]));
341     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[12]));
342     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
343 
344     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
345     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[11]));
346     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[13]));
347     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
348 
349     a0 = _mm_add_epi32(a0, o0);
350     a1 = _mm_add_epi32(a1, o0);
351 
352     b0 = _mm_add_epi32(a0, a2);
353     b1 = _mm_add_epi32(a1, a3);
354     b2 = _mm_sub_epi32(a0, a2);
355     b3 = _mm_sub_epi32(a1, a3);
356 
357     a0 = b0;
358     a1 = b1;
359     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
360     a3 = _mm_shuffle_epi32(b2, 0x1b);
361 
362     a0 = _mm_sra_epi32(a0, s0);
363     a1 = _mm_sra_epi32(a1, s0);
364     a2 = _mm_sra_epi32(a2, s0);
365     a3 = _mm_sra_epi32(a3, s0);
366 
367     x0 = _mm_packs_epi32(a0, a1);
368     x1 = _mm_packs_epi32(a2, a3);
369 
370     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
371     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
372 
373     src += src_stride;
374     dst += dst_stride;
375   }
376   while (--numRows);
377 }
378 
379 // 16-point inverse transform (rows where only the first 8 coefficients are nonzero)
380 static void invTransform16Half(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
381 {
382   __m128i s0 = _mm_cvtsi32_si128(shift);
383   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
384   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
385 
386   do
387   {
388     __m128i x0, x1;
389     __m128i a0, a1, a2, a3;
390     __m128i b0, b1, b2, b3;
391     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
392 
393 #ifdef SSSE3 // __SSSE3__
394     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15));
395 #else
396     x0 = _mm_shufflelo_epi16(x0, 0xd8); // 00 02 01 03 04 06 05 07
397     x0 = _mm_shufflehi_epi16(x0, 0xd8);
398 #endif
399 
400     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 02
401     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[2])); // 04 06
402 
403     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
404     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[3]));
405 
406     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
407     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[10]));
408 
409     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
410     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[11]));
411 
412     a0 = _mm_add_epi32(a0, o0);
413     a1 = _mm_add_epi32(a1, o0);
414 
415     b0 = _mm_add_epi32(a0, a2);
416     b1 = _mm_add_epi32(a1, a3);
417     b2 = _mm_sub_epi32(a0, a2);
418     b3 = _mm_sub_epi32(a1, a3);
419 
420     a0 = b0;
421     a1 = b1;
422     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
423     a3 = _mm_shuffle_epi32(b2, 0x1b);
424 
425     a0 = _mm_sra_epi32(a0, s0);
426     a1 = _mm_sra_epi32(a1, s0);
427     a2 = _mm_sra_epi32(a2, s0);
428     a3 = _mm_sra_epi32(a3, s0);
429 
430     x0 = _mm_packs_epi32(a0, a1);
431     x1 = _mm_packs_epi32(a2, a3);
432 
433     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
434     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
435 
436     src += src_stride;
437     dst += dst_stride;
438   }
439   while (--numRows);
440 }
441 
442 
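// Dispatch helper for the inverse 16-point pass: pattern bit 1 halves the number of rows processed
// and pattern bit 0 selects the kernel that assumes only the first 8 coefficients of each row are nonzero.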
443 static void invTransform16Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 pattern)
444 {
445   EB_U32 numRows = 16 - 4 * (pattern & 2);
446   if (pattern & 1)
447   {
448     invTransform16Half(src, src_stride, dst, dst_stride, shift, numRows);
449   }
450   else
451   {
452     invTransform16(src, src_stride, dst, dst_stride, shift, numRows);
453   }
454 }
455 
456 // inverse 16x16 transform
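// Two separable 1-D passes: the coefficients are transposed while their zero quadrants are detected,
// inverse transformed with shift 7, transposed again (skipping zero rows) and inverse transformed
// with shift 12 - addshift.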
457 void PFinvTransform16x16_SSSE3(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift)
458 {
459 
460   EB_U32 pattern = transpose16Check0s(src, src_stride, intermediate, 16);
461   invTransform16Partial(intermediate, 16, dst, dst_stride, 7, pattern);
462 
463   pattern >>= 1;
464   transpose16Partial(dst, dst_stride, intermediate, 16, pattern);
465   invTransform16Partial(intermediate, 16, dst, dst_stride, 12-addshift, pattern);
466 
467 }
468 
469 // transpose 32x32 block of data
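// Same 8x8-tile strategy as transpose16, extended to a 4x4 grid of tiles.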
470 static void transpose32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
471 {
472   EB_U32 i, j;
473   for (i = 0; i < 4; i++)
474   {
475     for (j = 0; j < 4; j++)
476     {
477       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
478       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
479 
480       a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
481       a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
482       a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
483       a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
484       a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
485       a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
486       a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
487       a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
488 
489       b0 = _mm_unpacklo_epi16(a0, a4);
490       b1 = _mm_unpacklo_epi16(a1, a5);
491       b2 = _mm_unpacklo_epi16(a2, a6);
492       b3 = _mm_unpacklo_epi16(a3, a7);
493       b4 = _mm_unpackhi_epi16(a0, a4);
494       b5 = _mm_unpackhi_epi16(a1, a5);
495       b6 = _mm_unpackhi_epi16(a2, a6);
496       b7 = _mm_unpackhi_epi16(a3, a7);
497 
498       a0 = _mm_unpacklo_epi16(b0, b2);
499       a1 = _mm_unpacklo_epi16(b1, b3);
500       a2 = _mm_unpackhi_epi16(b0, b2);
501       a3 = _mm_unpackhi_epi16(b1, b3);
502       a4 = _mm_unpacklo_epi16(b4, b6);
503       a5 = _mm_unpacklo_epi16(b5, b7);
504       a6 = _mm_unpackhi_epi16(b4, b6);
505       a7 = _mm_unpackhi_epi16(b5, b7);
506 
507       b0 = _mm_unpacklo_epi16(a0, a1);
508       b1 = _mm_unpackhi_epi16(a0, a1);
509       b2 = _mm_unpacklo_epi16(a2, a3);
510       b3 = _mm_unpackhi_epi16(a2, a3);
511       b4 = _mm_unpacklo_epi16(a4, a5);
512       b5 = _mm_unpackhi_epi16(a4, a5);
513       b6 = _mm_unpacklo_epi16(a6, a7);
514       b7 = _mm_unpackhi_epi16(a6, a7);
515 
516       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
517       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
518       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
519       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
520       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
521       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
522       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
523       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
524     }
525   }
526 }
527 
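// Partial transpose: the low two bits of pattern give the number of trailing 8-row bands
// that can be skipped because they are zero.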
528 static void transpose32Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 pattern)
529 {
530   EB_U32 j;
531   EB_U32 numRows = 4 - (pattern & 3);
532 
533   do
534   {
535     for (j = 0; j < 4; j++)
536     {
537       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
538       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
539 
540       a0 = _mm_loadu_si128((const __m128i *)(src + (0)*src_stride + 8*j));
541       a1 = _mm_loadu_si128((const __m128i *)(src + (1)*src_stride + 8*j));
542       a2 = _mm_loadu_si128((const __m128i *)(src + (2)*src_stride + 8*j));
543       a3 = _mm_loadu_si128((const __m128i *)(src + (3)*src_stride + 8*j));
544       a4 = _mm_loadu_si128((const __m128i *)(src + (4)*src_stride + 8*j));
545       a5 = _mm_loadu_si128((const __m128i *)(src + (5)*src_stride + 8*j));
546       a6 = _mm_loadu_si128((const __m128i *)(src + (6)*src_stride + 8*j));
547       a7 = _mm_loadu_si128((const __m128i *)(src + (7)*src_stride + 8*j));
548 
549       b0 = _mm_unpacklo_epi16(a0, a4);
550       b1 = _mm_unpacklo_epi16(a1, a5);
551       b2 = _mm_unpacklo_epi16(a2, a6);
552       b3 = _mm_unpacklo_epi16(a3, a7);
553       b4 = _mm_unpackhi_epi16(a0, a4);
554       b5 = _mm_unpackhi_epi16(a1, a5);
555       b6 = _mm_unpackhi_epi16(a2, a6);
556       b7 = _mm_unpackhi_epi16(a3, a7);
557 
558       a0 = _mm_unpacklo_epi16(b0, b2);
559       a1 = _mm_unpacklo_epi16(b1, b3);
560       a2 = _mm_unpackhi_epi16(b0, b2);
561       a3 = _mm_unpackhi_epi16(b1, b3);
562       a4 = _mm_unpacklo_epi16(b4, b6);
563       a5 = _mm_unpacklo_epi16(b5, b7);
564       a6 = _mm_unpackhi_epi16(b4, b6);
565       a7 = _mm_unpackhi_epi16(b5, b7);
566 
567       b0 = _mm_unpacklo_epi16(a0, a1);
568       b1 = _mm_unpackhi_epi16(a0, a1);
569       b2 = _mm_unpacklo_epi16(a2, a3);
570       b3 = _mm_unpackhi_epi16(a2, a3);
571       b4 = _mm_unpacklo_epi16(a4, a5);
572       b5 = _mm_unpackhi_epi16(a4, a5);
573       b6 = _mm_unpacklo_epi16(a6, a7);
574       b7 = _mm_unpackhi_epi16(a6, a7);
575 
576       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride), b0);
577       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride), b1);
578       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride), b2);
579       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride), b3);
580       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride), b4);
581       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride), b5);
582       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride), b6);
583       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride), b7);
584     }
585 
586     src += 8 * src_stride;
587     dst += 8;
588   }
589   while (--numRows);
590 }
591 
592 static EB_U32 transpose32Check0s(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride)
593 {
594   EB_U32 i, j;
595   EB_U32 zeroPattern = 0;
596   EB_U32 result = 0;
597 
598   for (i = 0; i < 4; i++)
599   {
600     for (j = 0; j < 4; j++)
601     {
602       __m128i a0, a1, a2, a3, a4, a5, a6, a7;
603       __m128i b0, b1, b2, b3, b4, b5, b6, b7;
604       __m128i c0;
605 
606       a0 = _mm_loadu_si128((const __m128i *)(src + (8*i+0)*src_stride + 8*j));
607       a1 = _mm_loadu_si128((const __m128i *)(src + (8*i+1)*src_stride + 8*j));
608       a2 = _mm_loadu_si128((const __m128i *)(src + (8*i+2)*src_stride + 8*j));
609       a3 = _mm_loadu_si128((const __m128i *)(src + (8*i+3)*src_stride + 8*j));
610       a4 = _mm_loadu_si128((const __m128i *)(src + (8*i+4)*src_stride + 8*j));
611       a5 = _mm_loadu_si128((const __m128i *)(src + (8*i+5)*src_stride + 8*j));
612       a6 = _mm_loadu_si128((const __m128i *)(src + (8*i+6)*src_stride + 8*j));
613       a7 = _mm_loadu_si128((const __m128i *)(src + (8*i+7)*src_stride + 8*j));
614 
615       c0 = _mm_or_si128(a0, a4);
616       c0 = _mm_or_si128(c0, a1);
617       c0 = _mm_or_si128(c0, a5);
618       c0 = _mm_or_si128(c0, a2);
619       c0 = _mm_or_si128(c0, a6);
620       c0 = _mm_or_si128(c0, a3);
621       c0 = _mm_or_si128(c0, a7);
622 
623       c0 = _mm_cmpeq_epi8(c0, _mm_setzero_si128());
624 
625       zeroPattern = 2 * zeroPattern + ((_mm_movemask_epi8(c0)+1) >> 16); // add a '1' bit if all zeros
626 
627       b0 = _mm_unpacklo_epi16(a0, a4);
628       b1 = _mm_unpacklo_epi16(a1, a5);
629       b2 = _mm_unpacklo_epi16(a2, a6);
630       b3 = _mm_unpacklo_epi16(a3, a7);
631       b4 = _mm_unpackhi_epi16(a0, a4);
632       b5 = _mm_unpackhi_epi16(a1, a5);
633       b6 = _mm_unpackhi_epi16(a2, a6);
634       b7 = _mm_unpackhi_epi16(a3, a7);
635 
636       a0 = _mm_unpacklo_epi16(b0, b2);
637       a1 = _mm_unpacklo_epi16(b1, b3);
638       a2 = _mm_unpackhi_epi16(b0, b2);
639       a3 = _mm_unpackhi_epi16(b1, b3);
640       a4 = _mm_unpacklo_epi16(b4, b6);
641       a5 = _mm_unpacklo_epi16(b5, b7);
642       a6 = _mm_unpackhi_epi16(b4, b6);
643       a7 = _mm_unpackhi_epi16(b5, b7);
644 
645       b0 = _mm_unpacklo_epi16(a0, a1);
646       b1 = _mm_unpackhi_epi16(a0, a1);
647       b2 = _mm_unpacklo_epi16(a2, a3);
648       b3 = _mm_unpackhi_epi16(a2, a3);
649       b4 = _mm_unpacklo_epi16(a4, a5);
650       b5 = _mm_unpackhi_epi16(a4, a5);
651       b6 = _mm_unpacklo_epi16(a6, a7);
652       b7 = _mm_unpackhi_epi16(a6, a7);
653 
654       _mm_storeu_si128((__m128i *)(dst + (8*j+0)*dst_stride + 8*i), b0);
655       _mm_storeu_si128((__m128i *)(dst + (8*j+1)*dst_stride + 8*i), b1);
656       _mm_storeu_si128((__m128i *)(dst + (8*j+2)*dst_stride + 8*i), b2);
657       _mm_storeu_si128((__m128i *)(dst + (8*j+3)*dst_stride + 8*i), b3);
658       _mm_storeu_si128((__m128i *)(dst + (8*j+4)*dst_stride + 8*i), b4);
659       _mm_storeu_si128((__m128i *)(dst + (8*j+5)*dst_stride + 8*i), b5);
660       _mm_storeu_si128((__m128i *)(dst + (8*j+6)*dst_stride + 8*i), b6);
661       _mm_storeu_si128((__m128i *)(dst + (8*j+7)*dst_stride + 8*i), b7);
662     }
663   }
664 
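  // The low two bits of the result encode how many bottom quarters of the block are entirely zero
  // (1, 2 or 3); bits 2-3 encode the same for the right-hand quarters.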
665   if ((zeroPattern & 0xfff) == 0xfff) result |= 3;
666   else if ((zeroPattern & 0xff) == 0xff) result |= 2;
667   else if ((zeroPattern & 0xf) == 0xf) result |= 1;
668 
669   if ((zeroPattern & 0x7777) == 0x7777) result |= 3*4;
670   else if ((zeroPattern & 0x3333) == 0x3333) result |= 2*4;
671   else if ((zeroPattern & 0x1111) == 0x1111) result |= 1*4;
672 
673   return result;
674 }
675 
676 // 32-point forward transform (32 rows)
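// Two butterfly stages (32-point, then 16-point on the summed half) feed the multiply-accumulate stage:
// a0-a3 produce the even-index outputs from x0/x1 and a4-a7 the odd-index outputs from x2/x3, using the
// second half of EbHevcCoeff_tbl; the trailing unpacks interleave the results back into natural order.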
677 static void transform32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift)
678 {
679   __m128i s0 = _mm_cvtsi32_si128(shift);
680   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
681   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl;
682 
683   EB_U32 numRows = 32;
684   do
685   {
686     __m128i x0, x1, x2, x3;
687     __m128i y0, y1, y2, y3;
688     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
689     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
690 
691     x0 = _mm_loadu_si128((const __m128i *)(src+0x00));
692     x1 = _mm_loadu_si128((const __m128i *)(src+0x08));
693     x2 = _mm_loadu_si128((const __m128i *)(src+0x10));
694     x3 = _mm_loadu_si128((const __m128i *)(src+0x18));
695 
696 
697     // 32-point butterfly
698     x2 = reverse_epi16(x2);
699     x3 = reverse_epi16(x3);
700 
701     y0 = _mm_add_epi16(x0, x3);
702     y1 = _mm_add_epi16(x1, x2);
703 
704     y2 = _mm_sub_epi16(x0, x3);
705     y3 = _mm_sub_epi16(x1, x2);
706 
707     // 16-point butterfly
708     y1 = reverse_epi16(y1);
709 
710     x0 = _mm_add_epi16(y0, y1);
711     x1 = _mm_sub_epi16(y0, y1);
712 
713 
714     x2 = y2;
715     x3 = y3;
716 
717     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
718     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2]));
719     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4]));
720     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6]));
721 
722     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
723     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
724     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
725     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
726 
727     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]);
728     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
729     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12]));
730     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14]));
731 
732     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
733     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
734     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
735     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
736 
737     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]);
738     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20]));
739     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24]));
740     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28]));
741     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32]));
742     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36]));
743     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
744     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
745 
746     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
747     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
748     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
749     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
750     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
751     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
752     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
753     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
754 
755     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
756     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
757     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
758     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
759     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
760     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
761     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
762     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
763 
764     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
765     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
766     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
767     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
768     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
769     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
770     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
771     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
772 
773     b0 = _mm_sra_epi32(_mm_add_epi32(a0, o0), s0);
774     b1 = _mm_sra_epi32(_mm_add_epi32(a1, o0), s0);
775     b2 = _mm_sra_epi32(_mm_add_epi32(a2, o0), s0);
776     b3 = _mm_sra_epi32(_mm_add_epi32(a3, o0), s0);
777     b4 = _mm_sra_epi32(_mm_add_epi32(a4, o0), s0);
778     b5 = _mm_sra_epi32(_mm_add_epi32(a5, o0), s0);
779     b6 = _mm_sra_epi32(_mm_add_epi32(a6, o0), s0);
780     b7 = _mm_sra_epi32(_mm_add_epi32(a7, o0), s0);
781 
782     x0 = _mm_packs_epi32(b0, b1);
783     x1 = _mm_packs_epi32(b2, b3);
784     x2 = _mm_packs_epi32(b4, b5);
785     x3 = _mm_packs_epi32(b6, b7);
786 
787     y0 = _mm_unpacklo_epi16(x0, x1);
788     y1 = _mm_unpackhi_epi16(x0, x1);
789     y2 = x2;
790     y3 = x3;
791     x0 = _mm_unpacklo_epi16(y0, y2);
792     x1 = _mm_unpackhi_epi16(y0, y2);
793     x2 = _mm_unpacklo_epi16(y1, y3);
794     x3 = _mm_unpackhi_epi16(y1, y3);
795 
796     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
797     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
798     _mm_storeu_si128((__m128i *)(dst+0x10), x2);
799     _mm_storeu_si128((__m128i *)(dst+0x18), x3);
800 
801     src += src_stride;
802     dst += dst_stride;
803   }
804   while (--numRows);
805 }
806 
807 // 32-point inverse transform (all 32 coefficients per row may be nonzero)
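// The SSSE3 path gathers even/odd coefficients with _mm_shuffle_epi8; the SSE2 fallback in the #else
// branch reaches the same layout with a longer unpack sequence. a0-a3 accumulate the even-coefficient
// contribution, a4-a7 the odd one, and two inverse butterflies rebuild the 32 outputs.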
808 static void invTransform32(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
809 {
810   __m128i s0 = _mm_cvtsi32_si128(shift);
811   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
812   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
813 
814   do
815   {
816     __m128i x0, x1, x2, x3;
817 #ifndef SSSE3 // __SSSE3__
818     __m128i y0, y1, y2, y3;
819 #endif
820     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
821     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
822     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
823     x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
824     x2 = _mm_loadu_si128((const __m128i *)(src+0x10)); // 10 11 12 13 14 15 16 17
825     x3 = _mm_loadu_si128((const __m128i *)(src+0x18)); // 18 19 1a 1b 1c 1d 1e 1f
826 
827 #ifdef SSSE3 // __SSSE3__
828     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
829     x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
830     x2 = _mm_shuffle_epi8(x2, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 10 14 12 16 11 13 15 17
831     x3 = _mm_shuffle_epi8(x3, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 18 1c 1a 1e 19 1b 1d 1f
832 
833     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
834     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
835     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[4]));
836     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[6]));
837 
838     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
839     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
840     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[5]));
841     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[7]));
842 
843     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
844     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
845     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[12]));
846     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[14]));
847 
848     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
849     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
850     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[13]));
851     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[15]));
852 
853     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
854     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
855     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
856     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
857     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[32]));
858     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[36]));
859     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40]));
860     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44]));
861 
862     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
863     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
864     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
865     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
866     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[33]));
867     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[37]));
868     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
869     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
870 
871     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
872     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
873     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
874     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
875     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[34]));
876     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[38]));
877     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
878     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
879 
880     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
881     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
882     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
883     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
884     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[35]));
885     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[39]));
886     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
887     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
888 
889 #else
890     y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
891     y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
892     y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
893     y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c
894 
895     x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
896     x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
897     x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
898     x3 = _mm_unpackhi_epi16(y2, y3); // 12 16
899 
900     y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
901     y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
902     y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
903     y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f
904 
905     x0 = y0;
906     x1 = y1;
907     x2 = y2;
908     x3 = y3;
909 
910     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
911     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c
912     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4])); // 10 14
913     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[6])); // 18 1c
914 
915     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
916     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
917     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
918     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[7]));
919 
920     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
921     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e
922     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12])); // 12 16
923     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[14])); // 1a 1e
924 
925     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
926     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
927     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
928     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[15]));
929 
930     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
931     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
932     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
933     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f
934     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32])); // 11 13
935     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36])); // 15 17
936     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[40])); // 19 1b
937     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[44])); // 1d 1f
938 
939     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
940     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
941     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
942     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
943     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
944     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
945     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[41]));
946     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[45]));
947 
948     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
949     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
950     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
951     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
952     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
953     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
954     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[42]));
955     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[46]));
956 
957     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
958     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
959     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
960     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
961     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
962     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
963     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xaa), coeff32[43]));
964     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0xff), coeff32[47]));
965 #endif
966 
967     a0 = _mm_add_epi32(a0, o0);
968     a1 = _mm_add_epi32(a1, o0);
969 
970     b0 = _mm_add_epi32(a0, a2);
971     b1 = _mm_add_epi32(a1, a3);
972     b2 = _mm_sub_epi32(a0, a2);
973     b3 = _mm_sub_epi32(a1, a3);
974 
975     a0 = b0;
976     a1 = b1;
977     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
978     a3 = _mm_shuffle_epi32(b2, 0x1b);
979 
980     b0 = _mm_add_epi32(a0, a4);
981     b1 = _mm_add_epi32(a1, a5);
982     b2 = _mm_add_epi32(a2, a6);
983     b3 = _mm_add_epi32(a3, a7);
984     b4 = _mm_sub_epi32(a0, a4);
985     b5 = _mm_sub_epi32(a1, a5);
986     b6 = _mm_sub_epi32(a2, a6);
987     b7 = _mm_sub_epi32(a3, a7);
988 
989     a0 = _mm_sra_epi32(b0, s0);
990     a1 = _mm_sra_epi32(b1, s0);
991     a2 = _mm_sra_epi32(b2, s0);
992     a3 = _mm_sra_epi32(b3, s0);
993     a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
994     a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
995     a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
996     a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
997 
998     x0 = _mm_packs_epi32(a0, a1);
999     x1 = _mm_packs_epi32(a2, a3);
1000     x2 = _mm_packs_epi32(a4, a5);
1001     x3 = _mm_packs_epi32(a6, a7);
1002 
1003     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1004     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1005     _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1006     _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1007 
1008     src += src_stride;
1009     dst += dst_stride;
1010   }
1011   while (--numRows);
1012 }
1013 
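// Variant of invTransform32 for rows where only the first 24 coefficients can be nonzero:
// the last quarter is treated as zero, so its loads and multiplies are dropped.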
1014 static void invTransform32ThreeQuarter(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
1015 {
1016   __m128i s0 = _mm_cvtsi32_si128(shift);
1017   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1018   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1019 
1020   do
1021   {
1022     __m128i x0, x1, x2, x3;
1023 #ifndef SSSE3 // __SSSE3__
1024     __m128i y0, y1, y2, y3;
1025 #endif
1026     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1027     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1028     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
1029     x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1030     x2 = _mm_loadu_si128((const __m128i *)(src+0x10)); // 10 11 12 13 14 15 16 17
1031     x3 = _mm_setzero_si128();
1032 
1033 #ifdef SSSE3 // __SSSE3__
1034     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
1035     x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
1036     x2 = _mm_shuffle_epi8(x2, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 10 14 12 16 11 13 15 17
1037 
1038     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1039     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
1040     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[4]));
1041 
1042     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1043     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
1044     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[5]));
1045 
1046     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
1047     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1048     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[12]));
1049 
1050     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
1051     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1052     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[13]));
1053 
1054     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
1055     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
1056     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
1057     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
1058     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[32]));
1059     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[36]));
1060 
1061     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
1062     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
1063     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
1064     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
1065     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[33]));
1066     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[37]));
1067 
1068     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
1069     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
1070     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
1071     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
1072     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[34]));
1073     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[38]));
1074 
1075     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
1076     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
1077     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
1078     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
1079     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[35]));
1080     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[39]));
1081 
1082 #else
1083     y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1084     y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1085     y2 = _mm_unpacklo_epi16(x2, x3); // 10 18
1086     y3 = _mm_unpackhi_epi16(x2, x3); // 14 1c
1087 
1088     x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1089     x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1090     x2 = _mm_unpacklo_epi16(y2, y3); // 10 14
1091     x3 = _mm_unpackhi_epi16(y2, y3); // 12 16
1092 
1093     y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
1094     y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
1095     y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1096     y3 = _mm_unpackhi_epi16(x2, x3); // 11 13 15 17 19 1b 1d 1f
1097 
1098     x0 = y0;
1099     x1 = y1;
1100     x2 = y2;
1101     x3 = y3;
1102 
1103     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
1104     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c
1105     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[4])); // 10 14
1106 
1107     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1108     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1109     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[5]));
1110 
1111     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
1112     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e
1113     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[12])); // 12 16
1114 
1115     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1116     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1117     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[13]));
1118 
1119     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
1120     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
1121     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
1122     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f
1123     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[32])); // 11 13
1124     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[36])); // 15 17
1125 
1126     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1127     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1128     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1129     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1130     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[33]));
1131     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[37]));
1132 
1133     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1134     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1135     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1136     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1137     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[34]));
1138     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[38]));
1139 
1140     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1141     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1142     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1143     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1144     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x00), coeff32[35]));
1145     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x3, 0x55), coeff32[39]));
1146 #endif
1147 
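    // Add the rounding offset, then recombine the partial sums with the usual
    // even/odd butterflies, shift down by 'shift', and saturate the 32 outputs
    // of this row back to 16 bits.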
1148     a0 = _mm_add_epi32(a0, o0);
1149     a1 = _mm_add_epi32(a1, o0);
1150 
1151     b0 = _mm_add_epi32(a0, a2);
1152     b1 = _mm_add_epi32(a1, a3);
1153     b2 = _mm_sub_epi32(a0, a2);
1154     b3 = _mm_sub_epi32(a1, a3);
1155 
1156     a0 = b0;
1157     a1 = b1;
1158     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1159     a3 = _mm_shuffle_epi32(b2, 0x1b);
1160 
1161     b0 = _mm_add_epi32(a0, a4);
1162     b1 = _mm_add_epi32(a1, a5);
1163     b2 = _mm_add_epi32(a2, a6);
1164     b3 = _mm_add_epi32(a3, a7);
1165     b4 = _mm_sub_epi32(a0, a4);
1166     b5 = _mm_sub_epi32(a1, a5);
1167     b6 = _mm_sub_epi32(a2, a6);
1168     b7 = _mm_sub_epi32(a3, a7);
1169 
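    // The subtractive half of the butterfly produces outputs 31..16, so those
    // registers are consumed in reverse order and their 32-bit lanes reversed
    // (shuffle 0x1b) before the shift, pack and store.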
1170     a0 = _mm_sra_epi32(b0, s0);
1171     a1 = _mm_sra_epi32(b1, s0);
1172     a2 = _mm_sra_epi32(b2, s0);
1173     a3 = _mm_sra_epi32(b3, s0);
1174     a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1175     a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1176     a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1177     a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1178 
1179     x0 = _mm_packs_epi32(a0, a1);
1180     x1 = _mm_packs_epi32(a2, a3);
1181     x2 = _mm_packs_epi32(a4, a5);
1182     x3 = _mm_packs_epi32(a6, a7);
1183 
1184     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1185     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1186     _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1187     _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1188 
1189     src += src_stride;
1190     dst += dst_stride;
1191   }
1192   while (--numRows);
1193 }
1194 
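// One-dimensional 32-point inverse transform for rows in which only the first
// 16 coefficients can be nonzero: coefficients 16..31 are treated as zero, so
// only two 128-bit loads and the corresponding coefficient products are
// needed per row.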
1195 static void invTransform32Half(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
1196 {
1197   __m128i s0 = _mm_cvtsi32_si128(shift);
1198   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1199   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1200 
1201   do
1202   {
1203     __m128i x0, x1, x2, x3;
1204 #ifndef SSSE3/// __SSSE3__
1205     __m128i y0, y1, y2;
1206 #endif
1207     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1208     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1209     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
1210     x1 = _mm_loadu_si128((const __m128i *)(src+0x08)); // 08 09 0a 0b 0c 0d 0e 0f
1211     x2 = _mm_setzero_si128();
1212     x3 = _mm_setzero_si128();
1213 
1214 #ifdef SSSE3/// __SSSE3__
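    // Reorder each row so that related coefficients sit in adjacent 16-bit
    // pairs (00 04 | 02 06 | 01 03 05 07), which lets every _mm_madd_epi16
    // below consume one pair at a time; the pairing presumably mirrors the
    // interleaved layout of EbHevcCoeff_tbl2.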
1215     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
1216     x1 = _mm_shuffle_epi8(x1, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 08 0c 0a 0e 09 0b 0d 0f
1217 
1218     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1219     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[2]));
1220 
1221     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1222     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[3]));
1223 
1224     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
1225     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10]));
1226 
1227     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
1228     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1229 
1230     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
1231     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
1232     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[24]));
1233     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[28]));
1234 
1235     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
1236     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
1237     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[25]));
1238     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[29]));
1239 
1240     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
1241     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
1242     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[26]));
1243     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[30]));
1244 
1245     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
1246     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
1247     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xaa), coeff32[27]));
1248     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0xff), coeff32[31]));
1249 
1250 #else
1251     y0 = _mm_unpacklo_epi16(x0, x1); // 00 08 01 09 02 0a 03 0b
1252     y1 = _mm_unpackhi_epi16(x0, x1); // 04 0c 05 0d 06 0e 07 0f
1253 
1254     x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1255     x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1256 
1257     y0 = _mm_unpacklo_epi64(x0, x2); // 00 04 08 0c 10 14 18 1c
1258     y1 = _mm_unpacklo_epi64(x1, x3); // 02 06 0a 0e 12 16 1a 1e
1259     y2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1260 
1261     x0 = y0;
1262     x1 = y1;
1263     x2 = y2;
1264 
1265     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
1266     a0 = _mm_add_epi32(a0, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[2])); // 08 0c
1267 
1268     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1269     a1 = _mm_add_epi32(a1, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[3]));
1270 
1271     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
1272     a2 = _mm_add_epi32(a2, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[10])); // 0a 0e
1273 
1274     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1275     a3 = _mm_add_epi32(a3, _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x55), coeff32[11]));
1276 
1277     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
1278     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
1279     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[24])); // 09 0b
1280     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[28])); // 0d 0f
1281 
1282     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1283     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1284     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[25]));
1285     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[29]));
1286 
1287     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1288     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1289     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[26]));
1290     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[30]));
1291 
1292     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1293     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1294     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xaa), coeff32[27]));
1295     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0xff), coeff32[31]));
1296 #endif
1297 
1298     a0 = _mm_add_epi32(a0, o0);
1299     a1 = _mm_add_epi32(a1, o0);
1300 
1301     b0 = _mm_add_epi32(a0, a2);
1302     b1 = _mm_add_epi32(a1, a3);
1303     b2 = _mm_sub_epi32(a0, a2);
1304     b3 = _mm_sub_epi32(a1, a3);
1305 
1306     a0 = b0;
1307     a1 = b1;
1308     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1309     a3 = _mm_shuffle_epi32(b2, 0x1b);
1310 
1311     b0 = _mm_add_epi32(a0, a4);
1312     b1 = _mm_add_epi32(a1, a5);
1313     b2 = _mm_add_epi32(a2, a6);
1314     b3 = _mm_add_epi32(a3, a7);
1315     b4 = _mm_sub_epi32(a0, a4);
1316     b5 = _mm_sub_epi32(a1, a5);
1317     b6 = _mm_sub_epi32(a2, a6);
1318     b7 = _mm_sub_epi32(a3, a7);
1319 
1320     a0 = _mm_sra_epi32(b0, s0);
1321     a1 = _mm_sra_epi32(b1, s0);
1322     a2 = _mm_sra_epi32(b2, s0);
1323     a3 = _mm_sra_epi32(b3, s0);
1324     a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1325     a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1326     a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1327     a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1328 
1329     x0 = _mm_packs_epi32(a0, a1);
1330     x1 = _mm_packs_epi32(a2, a3);
1331     x2 = _mm_packs_epi32(a4, a5);
1332     x3 = _mm_packs_epi32(a6, a7);
1333 
1334     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1335     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1336     _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1337     _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1338 
1339     src += src_stride;
1340     dst += dst_stride;
1341   }
1342   while (--numRows);
1343 }
1344 
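// One-dimensional 32-point inverse transform for rows in which only the first
// 8 coefficients can be nonzero; a single 128-bit load per row is sufficient.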
1345 static void invTransform32Quarter(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 numRows)
1346 {
1347   __m128i s0 = _mm_cvtsi32_si128(shift);
1348   __m128i o0 = _mm_set1_epi32(1 << (shift - 1));
1349   const __m128i *coeff32 = (const __m128i *)EbHevcCoeff_tbl2;
1350 
1351   do
1352   {
1353     __m128i x0, x1, x2, x3;
1354 #ifndef SSSE3/// __SSSE3__
1355     __m128i y0, y1;
1356 #endif
1357     __m128i a0, a1, a2, a3, a4, a5, a6, a7;
1358     __m128i b0, b1, b2, b3, b4, b5, b6, b7;
1359     x0 = _mm_loadu_si128((const __m128i *)(src+0x00)); // 00 01 02 03 04 05 06 07
1360 
1361 #ifdef SSSE3/// __SSSE3__
1362     x0 = _mm_shuffle_epi8(x0, _mm_setr_epi8(0, 1, 8, 9, 4, 5, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15)); // 00 04 02 06 01 03 05 07
1363 
1364     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]);
1365 
1366     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1367 
1368     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[8]);
1369 
1370     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x55), coeff32[9]);
1371 
1372     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[16]);
1373     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[20]));
1374 
1375     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[17]);
1376     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[21]));
1377 
1378     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[18]);
1379     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[22]));
1380 
1381     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xaa), coeff32[19]);
1382     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x0, 0xff), coeff32[23]));
1383 #else
1384     y0 = _mm_unpacklo_epi16(x0, x0); // 00 08 01 09 02 0a 03 0b
1385     y1 = _mm_unpackhi_epi16(x0, x0); // 04 0c 05 0d 06 0e 07 0f
1386 
1387     x0 = _mm_unpacklo_epi16(y0, y1); // 00 04 08 0c 01 05 09 0d
1388     x1 = _mm_unpackhi_epi16(y0, y1); // 02 06 0a 0e 03 07 0b 0f
1389 
1390     y0 = _mm_unpacklo_epi64(x0, x0); // 00 04 08 0c 10 14 18 1c
1391     y1 = _mm_unpacklo_epi64(x1, x1); // 02 06 0a 0e 12 16 1a 1e
1392     x2 = _mm_unpackhi_epi16(x0, x1); // 01 03 05 07 09 0b 0d 0f
1393 
1394     x0 = y0;
1395     x1 = y1;
1396 
1397     a0 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[0]); // 00 04
1398 
1399     a1 = _mm_madd_epi16(_mm_shuffle_epi32(x0, 0x00), coeff32[1]);
1400 
1401     a2 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[8]); // 02 06
1402 
1403     a3 = _mm_madd_epi16(_mm_shuffle_epi32(x1, 0x00), coeff32[9]);
1404 
1405     a4 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[16]); // 01 03
1406     a4 = _mm_add_epi32(a4, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[20])); // 05 07
1407 
1408     a5 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[17]);
1409     a5 = _mm_add_epi32(a5, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[21]));
1410 
1411     a6 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[18]);
1412     a6 = _mm_add_epi32(a6, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[22]));
1413 
1414     a7 = _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x00), coeff32[19]);
1415     a7 = _mm_add_epi32(a7, _mm_madd_epi16(_mm_shuffle_epi32(x2, 0x55), coeff32[23]));
1416 #endif
1417 
1418     a0 = _mm_add_epi32(a0, o0);
1419     a1 = _mm_add_epi32(a1, o0);
1420 
1421     b0 = _mm_add_epi32(a0, a2);
1422     b1 = _mm_add_epi32(a1, a3);
1423     b2 = _mm_sub_epi32(a0, a2);
1424     b3 = _mm_sub_epi32(a1, a3);
1425 
1426     a0 = b0;
1427     a1 = b1;
1428     a2 = _mm_shuffle_epi32(b3, 0x1b); // 00011011
1429     a3 = _mm_shuffle_epi32(b2, 0x1b);
1430 
1431     b0 = _mm_add_epi32(a0, a4);
1432     b1 = _mm_add_epi32(a1, a5);
1433     b2 = _mm_add_epi32(a2, a6);
1434     b3 = _mm_add_epi32(a3, a7);
1435     b4 = _mm_sub_epi32(a0, a4);
1436     b5 = _mm_sub_epi32(a1, a5);
1437     b6 = _mm_sub_epi32(a2, a6);
1438     b7 = _mm_sub_epi32(a3, a7);
1439 
1440     a0 = _mm_sra_epi32(b0, s0);
1441     a1 = _mm_sra_epi32(b1, s0);
1442     a2 = _mm_sra_epi32(b2, s0);
1443     a3 = _mm_sra_epi32(b3, s0);
1444     a4 = _mm_sra_epi32(_mm_shuffle_epi32(b7, 0x1b), s0);
1445     a5 = _mm_sra_epi32(_mm_shuffle_epi32(b6, 0x1b), s0);
1446     a6 = _mm_sra_epi32(_mm_shuffle_epi32(b5, 0x1b), s0);
1447     a7 = _mm_sra_epi32(_mm_shuffle_epi32(b4, 0x1b), s0);
1448 
1449     x0 = _mm_packs_epi32(a0, a1);
1450     x1 = _mm_packs_epi32(a2, a3);
1451     x2 = _mm_packs_epi32(a4, a5);
1452     x3 = _mm_packs_epi32(a6, a7);
1453 
1454     _mm_storeu_si128((__m128i *)(dst+0x00), x0);
1455     _mm_storeu_si128((__m128i *)(dst+0x08), x1);
1456     _mm_storeu_si128((__m128i *)(dst+0x10), x2);
1457     _mm_storeu_si128((__m128i *)(dst+0x18), x3);
1458 
1459     src += src_stride;
1460     dst += dst_stride;
1461   }
1462   while (--numRows);
1463 }
1464 
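// Dispatch on the zero-coefficient pattern produced by the transpose step:
// the low two bits select how much of each row can be nonzero (full, three
// quarters, half or a quarter of the 32 coefficients), while bits 2-3 cut the
// number of rows that actually need processing to 32, 24, 16 or 8.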
1465 static void invTransform32Partial(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_U32 shift, EB_U32 pattern)
1466 {
1467   EB_U32 numRows = 32 - 2 * (pattern & 12);
1468 
1469   switch (pattern & 3)
1470   {
1471     case 3:
1472       invTransform32Quarter(src, src_stride, dst, dst_stride, shift, numRows);
1473       break;
1474     case 2:
1475       invTransform32Half(src, src_stride, dst, dst_stride, shift, numRows);
1476       break;
1477     case 1:
1478       invTransform32ThreeQuarter(src, src_stride, dst, dst_stride, shift, numRows);
1479       break;
1480     default:
1481       invTransform32(src, src_stride, dst, dst_stride, shift, numRows);
1482       break;
1483   }
1484 }
1485 
1486 // inverse 32x32 transform
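// The 2-D inverse transform is performed as two 1-D passes: transpose32Check0s
// transposes the source into the intermediate buffer and reports which regions
// are all zero, the first pass runs with a shift of 7, and after a second
// (partial) transpose the row pass finishes with a shift of 12 - addshift.
//
// Illustrative call (a sketch only; buffer names are hypothetical and 8-bit
// content is assumed so that addshift == 0):
//
//   EB_S16 coeffs[32 * 32];    // dequantized transform coefficients, stride 32
//   EB_S16 residual[32 * 32];  // reconstructed residual, stride 32
//   EB_S16 scratch[32 * 32];   // intermediate / transpose buffer
//   PFinvTransform32x32_SSSE3(coeffs, 32, residual, 32, scratch, 0);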
1487 void PFinvTransform32x32_SSSE3(EB_S16 *src, EB_U32 src_stride, EB_S16 *dst, EB_U32 dst_stride, EB_S16 *intermediate, EB_U32 addshift)
1488 {
1489 
1490   EB_U32 pattern = transpose32Check0s(src, src_stride, intermediate, 32);
1491   invTransform32Partial(intermediate, 32, dst, dst_stride, 7, pattern);
1492 
1493   pattern >>= 2;
1494   transpose32Partial(dst, dst_stride, intermediate, 32, pattern);
1495   invTransform32Partial(intermediate, 32, dst, dst_stride, 12-addshift, pattern);
1496 
1497 }
1498 
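// Quantizes a 4x4 block of transform coefficients and immediately inverse
// quantizes (reconstructs) it, counting the nonzero quantized levels into
// *nonzerocoeff. Two rows (eight coefficients) are processed per iteration.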
1499 void QuantizeInvQuantize4x4_SSE3(
1500     EB_S16          *coeff,
1501     const EB_U32     coeffStride,
1502     EB_S16          *quantCoeff,
1503     EB_S16          *reconCoeff,
1504     const EB_U32     qFunc,
1505     const EB_U32     q_offset,
1506     const EB_S32     shiftedQBits,
1507     const EB_S32     shiftedFFunc,
1508     const EB_S32     iq_offset,
1509     const EB_S32     shiftNum,
1510     const EB_U32     areaSize,
1511     EB_U32          *nonzerocoeff)
1512 {
1513     int row;
1514 
1515     __m128i q = _mm_set1_epi16((EB_S16)qFunc);
1516     __m128i o = _mm_set1_epi32(q_offset);
1517     __m128i s = _mm_cvtsi32_si128(shiftedQBits);
1518 
1519     __m128i iq = _mm_set1_epi16((EB_S16)shiftedFFunc);
1520     __m128i io = _mm_set1_epi32(iq_offset);
1521     __m128i is = _mm_cvtsi32_si128(shiftNum);
1522 
1523     __m128i z = _mm_setzero_si128();
1524 
1525     (void)areaSize;
1526 
1527     row = 0;
1528     do
1529     {
1530         __m128i a0, a1;
1531         __m128i b0, b1;
1532         __m128i x0 = _mm_loadl_epi64((__m128i *)(coeff + coeffStride*row + 0));
1533         __m128i x1 = _mm_loadl_epi64((__m128i *)(coeff + coeffStride*row + coeffStride));
1534         __m128i y = _mm_unpacklo_epi64(x0, x1);
1535         __m128i x;
1536 
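        // Quantize the magnitudes: |coeff| * qFunc as a full 32-bit product
        // (mullo/mulhi), add the rounding offset, shift right by shiftedQBits,
        // saturate back to 16 bits, then restore the original signs below with
        // _mm_sign_epi16.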
1537         x = _mm_abs_epi16(y);
1538 
1539         a0 = _mm_mullo_epi16(x, q);
1540         a1 = _mm_mulhi_epi16(x, q);
1541 
1542         b0 = _mm_unpacklo_epi16(a0, a1);
1543         b1 = _mm_unpackhi_epi16(a0, a1);
1544 
1545         b0 = _mm_add_epi32(b0, o);
1546         b1 = _mm_add_epi32(b1, o);
1547 
1548         b0 = _mm_sra_epi32(b0, s);
1549         b1 = _mm_sra_epi32(b1, s);
1550 
1551         x = _mm_packs_epi32(b0, b1);
1552 
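        // _mm_cmpgt_epi16 returns -1 in every lane holding a nonzero quantized
        // level (x is non-negative here), so subtracting it bumps the per-lane
        // nonzero count.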
1553         z = _mm_sub_epi16(z, _mm_cmpgt_epi16(x, _mm_setzero_si128()));
1554 
1555         x = _mm_sign_epi16(x, y);
1556         _mm_storel_epi64((__m128i *)(quantCoeff + coeffStride*row + 0), x);
1557         _mm_storel_epi64((__m128i *)(quantCoeff + coeffStride*row + coeffStride), _mm_srli_si128(x, 8));
1558 
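        // If every quantized level in these two rows is zero, skip the inverse
        // quantization and store zeros directly into reconCoeff.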
1559         __m128i zer = _mm_setzero_si128();
1560         __m128i cmp = _mm_cmpeq_epi16(x, zer);
1561         int msk = _mm_movemask_epi8(cmp);
1562 
1563         if (msk != 0xFFFF)
1564         {
1565             a0 = _mm_mullo_epi16(x, iq);
1566             a1 = _mm_mulhi_epi16(x, iq);
1567 
1568             b0 = _mm_unpacklo_epi16(a0, a1);
1569             b1 = _mm_unpackhi_epi16(a0, a1);
1570 
1571             b0 = _mm_add_epi32(b0, io);
1572             b1 = _mm_add_epi32(b1, io);
1573 
1574             b0 = _mm_sra_epi32(b0, is);
1575             b1 = _mm_sra_epi32(b1, is);
1576 
1577             x = _mm_packs_epi32(b0, b1);
1578             _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + 0), x);
1579             _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + coeffStride), _mm_srli_si128(x, 8));
1580         }
1581         else {
1582             _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + 0), zer);
1583             _mm_storel_epi64((__m128i *)(reconCoeff + coeffStride*row + coeffStride), zer);
1584         }
1585         row += 2;
1586     } while (row < 4);
1587 
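    // Horizontal sum of the eight 16-bit lane counts: each count is at most 2,
    // so the high byte of every lane is zero and a byte-wise SAD between z and
    // a copy of itself shifted right by 7 bytes folds all eight counts into the
    // low 64-bit lane.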
1588     z = _mm_sad_epu8(z, _mm_srli_si128(z, 7));
1589     *nonzerocoeff = _mm_cvtsi128_si32(z);
1590 }
1591