1 /*
2 * H.265 video codec.
3 * Copyright (c) 2013 openHEVC contributors
4 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
5 *
6 * This file is part of libde265.
7 *
8 * libde265 is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU Lesser General Public License as
10 * published by the Free Software Foundation, either version 3 of
11 * the License, or (at your option) any later version.
12 *
13 * libde265 is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
20 */
21
#include "x86/sse-dct.h"
#include "libde265/util.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>    // memcpy (safe unaligned 32-bit stores)

#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3

#if HAVE_SSE4_1
#include <smmintrin.h> // SSE4.1
#endif
35
36
/* 4x4 DST-VII ("luma") inverse-transform coefficients, stored as pairs
 * repeated across all four 32-bit lanes so one _mm_madd_epi16 against a
 * pair-interleaved source row computes four butterfly terms at once.
 * NOTE(review): the '+' sign prefixes are inconsistent but harmless. */
ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
{
    {   29, +84, 29,  +84,  29, +84, 29,  +84 },
    {  +74, +55, +74, +55, +74, +55, +74, +55 },
    {   55, -29, 55,  -29,  55, -29, 55,  -29 },
    {  +74, -84, +74, -84, +74, -84, +74, -84 },
    {   74, -74, 74,  -74,  74, -74, 74,  -74 },
    {    0,  +74, 0,  +74,  0,  +74, 0,   +74 },
    {   84, +55, 84,  +55,  84, +55, 84,  +55 },
    {  -74, -29, -74, -29, -74, -29, -74, -29 }
};
48
/* 4x4 DCT-II inverse-transform coefficient pairs (even rows {64,64}/{64,-64},
 * odd rows {83,36}/{36,-83}), each pair repeated over all four madd lanes. */
ALIGNED_16(static const int16_t) transform4x4[4][8] = {
    { 64,  64, 64,  64, 64,  64, 64,  64 },
    { 64, -64, 64, -64, 64, -64, 64, -64 },
    { 83,  36, 83,  36, 83,  36, 83,  36 },
    { 36, -83, 36, -83, 36, -83, 36, -83 }
};
55
/* 8x8 DCT-II inverse-transform coefficient pairs: rows 0-7 are the odd-part
 * factors (89/75/50/18), rows 8-11 the even part shared with the 4x4 DCT.
 * Each pair is replicated across all four _mm_madd_epi16 lanes. */
ALIGNED_16(static const int16_t) transform8x8[12][8] =
{
    {  89,  75,  89,  75,  89,  75,  89,  75 },
    {  50,  18,  50,  18,  50,  18,  50,  18 },
    {  75, -18,  75, -18,  75, -18,  75, -18 },
    { -89, -50, -89, -50, -89, -50, -89, -50 },
    {  50, -89,  50, -89,  50, -89,  50, -89 },
    {  18,  75,  18,  75,  18,  75,  18,  75 },
    {  18, -50,  18, -50,  18, -50,  18, -50 },
    {  75, -89,  75, -89,  75, -89,  75, -89 },
    {  64,  64,  64,  64,  64,  64,  64,  64 },
    {  64, -64,  64, -64,  64, -64,  64, -64 },
    {  83,  36,  83,  36,  83,  36,  83,  36 },
    {  36, -83,  36, -83,  36, -83,  36, -83 }
};
71
/* 16x16 DCT-II odd-part coefficient pairs. The four groups pair the
 * odd-frequency basis rows as annotated below (original row index for the
 * 16-point / 32-point usage); each pair repeats over all madd lanes. */
ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
{
    {/*1-3*/ /*2-6*/
        {  90,  87,  90,  87,  90,  87,  90,  87 },
        {  87,  57,  87,  57,  87,  57,  87,  57 },
        {  80,   9,  80,   9,  80,   9,  80,   9 },
        {  70, -43,  70, -43,  70, -43,  70, -43 },
        {  57, -80,  57, -80,  57, -80,  57, -80 },
        {  43, -90,  43, -90,  43, -90,  43, -90 },
        {  25, -70,  25, -70,  25, -70,  25, -70 },
        {   9, -25,   9, -25,   9, -25,   9, -25 },
    },{ /*5-7*/ /*10-14*/
        {  80,  70,  80,  70,  80,  70,  80,  70 },
        {   9, -43,   9, -43,   9, -43,   9, -43 },
        { -70, -87, -70, -87, -70, -87, -70, -87 },
        { -87,   9, -87,   9, -87,   9, -87,   9 },
        { -25,  90, -25,  90, -25,  90, -25,  90 },
        {  57,  25,  57,  25,  57,  25,  57,  25 },
        {  90, -80,  90, -80,  90, -80,  90, -80 },
        {  43, -57,  43, -57,  43, -57,  43, -57 },
    },{ /*9-11*/ /*18-22*/
        {  57,  43,  57,  43,  57,  43,  57,  43 },
        { -80, -90, -80, -90, -80, -90, -80, -90 },
        { -25,  57, -25,  57, -25,  57, -25,  57 },
        {  90,  25,  90,  25,  90,  25,  90,  25 },
        {  -9, -87,  -9, -87,  -9, -87,  -9, -87 },
        { -87,  70, -87,  70, -87,  70, -87,  70 },
        {  43,   9,  43,   9,  43,   9,  43,   9 },
        {  70, -80,  70, -80,  70, -80,  70, -80 },
    },{/*13-15*/ /* 26-30 */
        {  25,   9,  25,   9,  25,   9,  25,   9 },
        { -70, -25, -70, -25, -70, -25, -70, -25 },
        {  90,  43,  90,  43,  90,  43,  90,  43 },
        { -80, -57, -80, -57, -80, -57, -80, -57 },
        {  43,  70,  43,  70,  43,  70,  43,  70 },
        {   9, -80,   9, -80,   9, -80,   9, -80 },
        { -57,  87, -57,  87, -57,  87, -57,  87 },
        {  87, -90,  87, -90,  87, -90,  87, -90 },
    }
};
112
/* 16x16 DCT-II "even-odd" stage coefficient pairs (8-point odd part),
 * grouped per basis-row pairing as annotated; pairs repeated per lane. */
ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
{
    { /*2-6*/ /*4-12*/
        {  89,  75,  89,  75,  89,  75,  89,  75 },
        {  75, -18,  75, -18,  75, -18,  75, -18 },
        {  50, -89,  50, -89,  50, -89,  50, -89 },
        {  18, -50,  18, -50,  18, -50,  18, -50 },
    },{ /*10-14*/ /*20-28*/
        {  50,  18,  50,  18,  50,  18,  50,  18 },
        { -89, -50, -89, -50, -89, -50, -89, -50 },
        {  18,  75,  18,  75,  18,  75,  18,  75 },
        {  75, -89,  75, -89,  75, -89,  75, -89 },
    }
};
127
/* 16x16 DCT-II innermost ("even-even") stage coefficient pairs — the
 * 4-point DCT core shared by all larger transform sizes. */
ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
{
    {/*4-12*/ /*8-24*/
        {  83,  36,  83,  36,  83,  36,  83,  36 },
        {  36, -83,  36, -83,  36, -83,  36, -83 },
    },{ /*0-8*/ /*0-16*/
        {  64,  64,  64,  64,  64,  64,  64,  64 },
        {  64, -64,  64, -64,  64, -64,  64, -64 },
    }
};
138
139
/* 32x32 DCT-II odd-part coefficient pairs. Eight groups of 16 pair-rows,
 * one group per pairing of odd basis rows (annotated with original row
 * indices); each coefficient pair is repeated over all four madd lanes. */
ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
{
    { /* 1-3 */
        {  90,  90,  90,  90,  90,  90,  90,  90 },
        {  90,  82,  90,  82,  90,  82,  90,  82 },
        {  88,  67,  88,  67,  88,  67,  88,  67 },
        {  85,  46,  85,  46,  85,  46,  85,  46 },
        {  82,  22,  82,  22,  82,  22,  82,  22 },
        {  78,  -4,  78,  -4,  78,  -4,  78,  -4 },
        {  73, -31,  73, -31,  73, -31,  73, -31 },
        {  67, -54,  67, -54,  67, -54,  67, -54 },
        {  61, -73,  61, -73,  61, -73,  61, -73 },
        {  54, -85,  54, -85,  54, -85,  54, -85 },
        {  46, -90,  46, -90,  46, -90,  46, -90 },
        {  38, -88,  38, -88,  38, -88,  38, -88 },
        {  31, -78,  31, -78,  31, -78,  31, -78 },
        {  22, -61,  22, -61,  22, -61,  22, -61 },
        {  13, -38,  13, -38,  13, -38,  13, -38 },
        {   4, -13,   4, -13,   4, -13,   4, -13 },
    },{/* 5-7 */
        {  88,  85,  88,  85,  88,  85,  88,  85 },
        {  67,  46,  67,  46,  67,  46,  67,  46 },
        {  31, -13,  31, -13,  31, -13,  31, -13 },
        { -13, -67, -13, -67, -13, -67, -13, -67 },
        { -54, -90, -54, -90, -54, -90, -54, -90 },
        { -82, -73, -82, -73, -82, -73, -82, -73 },
        { -90, -22, -90, -22, -90, -22, -90, -22 },
        { -78,  38, -78,  38, -78,  38, -78,  38 },
        { -46,  82, -46,  82, -46,  82, -46,  82 },
        {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
        {  38,  54,  38,  54,  38,  54,  38,  54 },
        {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
        {  90, -61,  90, -61,  90, -61,  90, -61 },
        {  85, -90,  85, -90,  85, -90,  85, -90 },
        {  61, -78,  61, -78,  61, -78,  61, -78 },
        {  22, -31,  22, -31,  22, -31,  22, -31 },
    },{/* 9-11 */
        {  82,  78,  82,  78,  82,  78,  82,  78 },
        {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
        { -54, -82, -54, -82, -54, -82, -54, -82 },
        { -90, -73, -90, -73, -90, -73, -90, -73 },
        { -61,  13, -61,  13, -61,  13, -61,  13 },
        {  13,  85,  13,  85,  13,  85,  13,  85 },
        {  78,  67,  78,  67,  78,  67,  78,  67 },
        {  85, -22,  85, -22,  85, -22,  85, -22 },
        {  31, -88,  31, -88,  31, -88,  31, -88 },
        { -46, -61, -46, -61, -46, -61, -46, -61 },
        { -90,  31, -90,  31, -90,  31, -90,  31 },
        { -67,  90, -67,  90, -67,  90, -67,  90 },
        {   4,  54,   4,  54,   4,  54,   4,  54 },
        {  73, -38,  73, -38,  73, -38,  73, -38 },
        {  88, -90,  88, -90,  88, -90,  88, -90 },
        {  38, -46,  38, -46,  38, -46,  38, -46 },
    },{/* 13-15 */
        {  73,  67,  73,  67,  73,  67,  73,  67 },
        { -31, -54, -31, -54, -31, -54, -31, -54 },
        { -90, -78, -90, -78, -90, -78, -90, -78 },
        { -22,  38, -22,  38, -22,  38, -22,  38 },
        {  78,  85,  78,  85,  78,  85,  78,  85 },
        {  67, -22,  67, -22,  67, -22,  67, -22 },
        { -38, -90, -38, -90, -38, -90, -38, -90 },
        { -90,   4, -90,   4, -90,   4, -90,   4 },
        { -13,  90, -13,  90, -13,  90, -13,  90 },
        {  82,  13,  82,  13,  82,  13,  82,  13 },
        {  61, -88,  61, -88,  61, -88,  61, -88 },
        { -46, -31, -46, -31, -46, -31, -46, -31 },
        { -88,  82, -88,  82, -88,  82, -88,  82 },
        {  -4,  46,  -4,  46,  -4,  46,  -4,  46 },
        {  85, -73,  85, -73,  85, -73,  85, -73 },
        {  54, -61,  54, -61,  54, -61,  54, -61 },
    },{/* 17-19 */
        {  61,  54,  61,  54,  61,  54,  61,  54 },
        { -73, -85, -73, -85, -73, -85, -73, -85 },
        { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
        {  82,  88,  82,  88,  82,  88,  82,  88 },
        {  31, -46,  31, -46,  31, -46,  31, -46 },
        { -88, -61, -88, -61, -88, -61, -88, -61 },
        { -13,  82, -13,  82, -13,  82, -13,  82 },
        {  90,  13,  90,  13,  90,  13,  90,  13 },
        {  -4, -90,  -4, -90,  -4, -90,  -4, -90 },
        { -90,  38, -90,  38, -90,  38, -90,  38 },
        {  22,  67,  22,  67,  22,  67,  22,  67 },
        {  85, -78,  85, -78,  85, -78,  85, -78 },
        { -38, -22, -38, -22, -38, -22, -38, -22 },
        { -78,  90, -78,  90, -78,  90, -78,  90 },
        {  54, -31,  54, -31,  54, -31,  54, -31 },
        {  67, -73,  67, -73,  67, -73,  67, -73 },
    },{ /* 21-23 */
        {  46,  38,  46,  38,  46,  38,  46,  38 },
        { -90, -88, -90, -88, -90, -88, -90, -88 },
        {  38,  73,  38,  73,  38,  73,  38,  73 },
        {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
        { -90, -67, -90, -67, -90, -67, -90, -67 },
        {  31,  90,  31,  90,  31,  90,  31,  90 },
        {  61, -46,  61, -46,  61, -46,  61, -46 },
        { -88, -31, -88, -31, -88, -31, -88, -31 },
        {  22,  85,  22,  85,  22,  85,  22,  85 },
        {  67, -78,  67, -78,  67, -78,  67, -78 },
        { -85,  13, -85,  13, -85,  13, -85,  13 },
        {  13,  61,  13,  61,  13,  61,  13,  61 },
        {  73, -90,  73, -90,  73, -90,  73, -90 },
        { -82,  54, -82,  54, -82,  54, -82,  54 },
        {   4,  22,   4,  22,   4,  22,   4,  22 },
        {  78, -82,  78, -82,  78, -82,  78, -82 },
    },{ /* 25-27 */
        {  31,  22,  31,  22,  31,  22,  31,  22 },
        { -78, -61, -78, -61, -78, -61, -78, -61 },
        {  90,  85,  90,  85,  90,  85,  90,  85 },
        { -61, -90, -61, -90, -61, -90, -61, -90 },
        {   4,  73,   4,  73,   4,  73,   4,  73 },
        {  54, -38,  54, -38,  54, -38,  54, -38 },
        { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
        {  82,  46,  82,  46,  82,  46,  82,  46 },
        { -38, -78, -38, -78, -38, -78, -38, -78 },
        { -22,  90, -22,  90, -22,  90, -22,  90 },
        {  73, -82,  73, -82,  73, -82,  73, -82 },
        { -90,  54, -90,  54, -90,  54, -90,  54 },
        {  67, -13,  67, -13,  67, -13,  67, -13 },
        { -13, -31, -13, -31, -13, -31, -13, -31 },
        { -46,  67, -46,  67, -46,  67, -46,  67 },
        {  85, -88,  85, -88,  85, -88,  85, -88 },
    },{/* 29-31 */
        {  13,   4,  13,   4,  13,   4,  13,   4 },
        { -38, -13, -38, -13, -38, -13, -38, -13 },
        {  61,  22,  61,  22,  61,  22,  61,  22 },
        { -78, -31, -78, -31, -78, -31, -78, -31 },
        {  88,  38,  88,  38,  88,  38,  88,  38 },
        { -90, -46, -90, -46, -90, -46, -90, -46 },
        {  85,  54,  85,  54,  85,  54,  85,  54 },
        { -73, -61, -73, -61, -73, -61, -73, -61 },
        {  54,  67,  54,  67,  54,  67,  54,  67 },
        { -31, -73, -31, -73, -31, -73, -31, -73 },
        {   4,  78,   4,  78,   4,  78,   4,  78 },
        {  22, -82,  22, -82,  22, -82,  22, -82 },
        { -46,  85, -46,  85, -46,  85, -46,  85 },
        {  67, -88,  67, -88,  67, -88,  67, -88 },
        { -82,  90, -82,  90, -82,  90, -82,  90 },
        {  90, -90,  90, -90,  90, -90,  90, -90 },
    }
};
280
/* First inverse-transform stage: shift by 7 with symmetric rounding offset
 * add_1st = 1 << (shift_1st - 1) = 64 (bit-depth independent). */
#define shift_1st 7
#define add_1st (1 << (shift_1st - 1))
283
284
/* Transform-skip reconstruction of an 8-bit 4x4 block (SSE2).
 *
 * Rescales the 16 residual coefficients as (c + 16) >> 5 (saturating add,
 * arithmetic shift), adds them to the destination pixels and stores the
 * result clipped to [0,255].
 *
 * _dst    : top-left of the 4x4 destination block (no alignment required;
 *           NOTE: 8 bytes are read per row, so 4 bytes past each row's
 *           4 written pixels must be readable)
 * coeffs  : 16 coefficients, row-major, 16-byte aligned (_mm_load_si128)
 * _stride : destination stride in bytes
 */
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
{
    uint8_t *dst = (uint8_t *) _dst;
    ptrdiff_t stride = _stride;
    const int shift = 5;    /* 13 - bit depth (8) */
    const int offset = 16;  /* 1 << (shift - 1), rounding */
    __m128i r0, r1, r2, r3, r4, r5, r6, r9;
    uint32_t row;

    r9 = _mm_setzero_si128();
    r2 = _mm_set1_epi16(offset);

    r0 = _mm_load_si128((const __m128i *) (coeffs));
    r1 = _mm_load_si128((const __m128i *) (coeffs + 8));

    /* (coeff + offset) >> shift */
    r0 = _mm_adds_epi16(r0, r2);
    r1 = _mm_adds_epi16(r1, r2);
    r0 = _mm_srai_epi16(r0, shift);
    r1 = _mm_srai_epi16(r1, shift);

    /* load the four destination rows (8 bytes each, only 4 are used) */
    r3 = _mm_loadl_epi64((const __m128i *) (dst));
    r4 = _mm_loadl_epi64((const __m128i *) (dst + stride));
    r5 = _mm_loadl_epi64((const __m128i *) (dst + 2 * stride));
    r6 = _mm_loadl_epi64((const __m128i *) (dst + 3 * stride));

    /* widen to 16 bit and pair rows: r3 = rows 0|1, r4 = rows 2|3 */
    r3 = _mm_unpacklo_epi8(r3, r9);
    r4 = _mm_unpacklo_epi8(r4, r9);
    r5 = _mm_unpacklo_epi8(r5, r9);
    r6 = _mm_unpacklo_epi8(r6, r9);
    r3 = _mm_unpacklo_epi64(r3, r4);
    r4 = _mm_unpacklo_epi64(r5, r6);

    /* add residual and clip to [0,255] */
    r3 = _mm_adds_epi16(r3, r0);
    r4 = _mm_adds_epi16(r4, r1);
    r3 = _mm_packus_epi16(r3, r4);

    /* Store 4 bytes per row with memcpy: the previous
     * *(uint32_t *)dst assignments were misaligned, type-punned
     * writes (undefined behavior); memcpy compiles to the same
     * single 32-bit store. */
    row = (uint32_t) _mm_cvtsi128_si32(r3);
    memcpy(dst, &row, 4);

    r3 = _mm_srli_si128(r3, 4);
    row = (uint32_t) _mm_cvtsi128_si32(r3);
    memcpy(dst + stride, &row, 4);

    r3 = _mm_srli_si128(r3, 4);
    row = (uint32_t) _mm_cvtsi128_si32(r3);
    memcpy(dst + 2 * stride, &row, 4);

    r3 = _mm_srli_si128(r3, 4);
    row = (uint32_t) _mm_cvtsi128_si32(r3);
    memcpy(dst + 3 * stride, &row, 4);
}
341
342
343
344 #if HAVE_SSE4_1
ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)345 void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
346 ptrdiff_t _stride) {
347
348 uint8_t shift_2nd = 12; // 20 - Bit depth
349 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
350
351 uint8_t *dst = (uint8_t*) _dst;
352 ptrdiff_t stride = _stride;
353 const int16_t *src = coeffs;
354 __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
355 m128iD;
356 m128iAdd = _mm_set1_epi32(64);
357
358 S0 = _mm_load_si128((__m128i *) (src));
359 S8 = _mm_load_si128((__m128i *) (src + 8));
360
361 m128iAC = _mm_unpacklo_epi16(S0, S8);
362 m128iBD = _mm_unpackhi_epi16(S0, S8);
363
364 m128iTmp1 = _mm_madd_epi16(m128iAC,
365 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
366 m128iTmp2 = _mm_madd_epi16(m128iBD,
367 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
368 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369 S0 = _mm_add_epi32(S0, m128iAdd);
370 S0 = _mm_srai_epi32(S0, shift_1st);
371
372 m128iTmp1 = _mm_madd_epi16(m128iAC,
373 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
374 m128iTmp2 = _mm_madd_epi16(m128iBD,
375 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
376 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377 S8 = _mm_add_epi32(S8, m128iAdd);
378 S8 = _mm_srai_epi32(S8, shift_1st);
379
380 m128iA = _mm_packs_epi32(S0, S8);
381
382 m128iTmp1 = _mm_madd_epi16(m128iAC,
383 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
384 m128iTmp2 = _mm_madd_epi16(m128iBD,
385 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
386 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
387 S0 = _mm_add_epi32(S0, m128iAdd);
388 S0 = _mm_srai_epi32(S0, shift_1st);
389
390 m128iTmp1 = _mm_madd_epi16(m128iAC,
391 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
392 m128iTmp2 = _mm_madd_epi16(m128iBD,
393 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
394 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
395 S8 = _mm_add_epi32(S8, m128iAdd);
396 S8 = _mm_srai_epi32(S8, shift_1st);
397
398 m128iD = _mm_packs_epi32(S0, S8);
399
400 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
401 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
402
403 m128iA = _mm_unpacklo_epi16(S0, S8);
404 m128iD = _mm_unpackhi_epi16(S0, S8);
405
406 /* ################### */
407 m128iAdd = _mm_set1_epi32(add_2nd);
408
409 m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
410 m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
411
412 m128iTmp1 = _mm_madd_epi16(m128iAC,
413 _mm_load_si128((__m128i *) (transform4x4_luma[0])));
414 m128iTmp2 = _mm_madd_epi16(m128iBD,
415 _mm_load_si128((__m128i *) (transform4x4_luma[1])));
416 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417 S0 = _mm_add_epi32(S0, m128iAdd);
418 S0 = _mm_srai_epi32(S0, shift_2nd);
419
420 m128iTmp1 = _mm_madd_epi16(m128iAC,
421 _mm_load_si128((__m128i *) (transform4x4_luma[2])));
422 m128iTmp2 = _mm_madd_epi16(m128iBD,
423 _mm_load_si128((__m128i *) (transform4x4_luma[3])));
424 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425 S8 = _mm_add_epi32(S8, m128iAdd);
426 S8 = _mm_srai_epi32(S8, shift_2nd);
427
428 m128iA = _mm_packs_epi32(S0, S8);
429
430 m128iTmp1 = _mm_madd_epi16(m128iAC,
431 _mm_load_si128((__m128i *) (transform4x4_luma[4])));
432 m128iTmp2 = _mm_madd_epi16(m128iBD,
433 _mm_load_si128((__m128i *) (transform4x4_luma[5])));
434 S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
435 S0 = _mm_add_epi32(S0, m128iAdd);
436 S0 = _mm_srai_epi32(S0, shift_2nd);
437
438 m128iTmp1 = _mm_madd_epi16(m128iAC,
439 _mm_load_si128((__m128i *) (transform4x4_luma[6])));
440 m128iTmp2 = _mm_madd_epi16(m128iBD,
441 _mm_load_si128((__m128i *) (transform4x4_luma[7])));
442 S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
443 S8 = _mm_add_epi32(S8, m128iAdd);
444 S8 = _mm_srai_epi32(S8, shift_2nd);
445
446 m128iD = _mm_packs_epi32(S0, S8);
447
448 // _mm_storeu_si128((__m128i *) (src), m128iA);
449 // _mm_storeu_si128((__m128i *) (src + 8), m128iD);
450
451 S0 = _mm_move_epi64(m128iA); //contains row 0
452 S8 = _mm_move_epi64(m128iD); //row 2
453 m128iA = _mm_srli_si128(m128iA, 8); // row 1
454 m128iD = _mm_srli_si128(m128iD, 8); // row 3
455 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
456 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
457 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
458 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
459
460 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
461
462 m128iA = _mm_loadl_epi64((__m128i *) dst);
463 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
465 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468
469 dst += stride;
470
471 m128iA = _mm_loadl_epi64((__m128i *) dst);
472 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
474 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477
478 dst += stride;
479
480 m128iA = _mm_loadl_epi64((__m128i *) dst);
481 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
482 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
483 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
484 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
485 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
486
487 dst += stride;
488
489 m128iA = _mm_loadl_epi64((__m128i *) dst);
490 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
491 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
492 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
493 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
494 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
495 }
496 #endif // SSE4.1
497
#if 0
/* Disabled 10-bit variant of the 4x4 luma (DST-VII) inverse transform.
 * NOTE(review): as written, 'int16_t *src = coeffs' drops const from the
 * parameter, and the second-stage result is stored back into the
 * coefficient buffer before the scalar reconstruction loop — both would
 * need fixing before this block could be re-enabled. */
void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i,j;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride/(sizeof(uint16_t));
    int16_t *src = coeffs;
    __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
            m128iD;

    m128iAdd = _mm_set1_epi32(64);  /* rounding for shift_1st */

    S0 = _mm_loadu_si128((__m128i *) (src));
    S8 = _mm_loadu_si128((__m128i *) (src + 8));

    m128iAC = _mm_unpacklo_epi16(S0, S8);
    m128iBD = _mm_unpackhi_epi16(S0, S8);

    /* first transform stage */
    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_1st);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_1st);

    m128iD = _mm_packs_epi32(S0, S8);

    /* transpose between stages */
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /* second transform stage */
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
    m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[0])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[1])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[2])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[3])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iA = _mm_packs_epi32(S0, S8);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[4])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[5])));
    S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S0 = _mm_add_epi32(S0, m128iAdd);
    S0 = _mm_srai_epi32(S0, shift_2nd);

    m128iTmp1 = _mm_madd_epi16(m128iAC,
            _mm_load_si128((__m128i *) (transform4x4_luma[6])));
    m128iTmp2 = _mm_madd_epi16(m128iBD,
            _mm_load_si128((__m128i *) (transform4x4_luma[7])));
    S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
    S8 = _mm_add_epi32(S8, m128iAdd);
    S8 = _mm_srai_epi32(S8, shift_2nd);

    m128iD = _mm_packs_epi32(S0, S8);

    /* write back into the coefficient buffer, then do the clipped
     * reconstruction with scalar code (10-bit range needs av_clip_uintp2) */
    _mm_storeu_si128((__m128i *) (src), m128iA);
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
    j = 0;
    for (i = 0; i < 2; i++) {
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
    }

}
#endif
623
624
625 #if HAVE_SSE4_1
ff_hevc_transform_4x4_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)626 void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
627 ptrdiff_t _stride) {
628 uint8_t shift_2nd = 12; // 20 - Bit depth
629 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
630
631 uint8_t *dst = (uint8_t*) _dst;
632 ptrdiff_t stride = _stride;
633 const int16_t *src = coeffs;
634
635 __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
636 S0 = _mm_load_si128((__m128i *) (src));
637 S8 = _mm_load_si128((__m128i *) (src + 8));
638 m128iAdd = _mm_set1_epi32(add_1st);
639
640 m128Tmp = _mm_unpacklo_epi16(S0, S8);
641 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
642 E1 = _mm_add_epi32(E1, m128iAdd);
643
644 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
645 E2 = _mm_add_epi32(E2, m128iAdd);
646
647 m128Tmp = _mm_unpackhi_epi16(S0, S8);
648 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
649 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
650
651 m128iA = _mm_add_epi32(E1, O1);
652 m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
653 m128Tmp = _mm_add_epi32(E2, O2);
654 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
655 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
656
657 m128iD = _mm_sub_epi32(E2, O2);
658 m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum
659
660 m128Tmp = _mm_sub_epi32(E1, O1);
661 m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
662
663 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
664
665 S0 = _mm_unpacklo_epi16(m128iA, m128iD);
666 S8 = _mm_unpackhi_epi16(m128iA, m128iD);
667
668 m128iA = _mm_unpacklo_epi16(S0, S8);
669 m128iD = _mm_unpackhi_epi16(S0, S8);
670
671 /* ########################## */
672
673 m128iAdd = _mm_set1_epi32(add_2nd);
674 m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
675 E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
676 E1 = _mm_add_epi32(E1, m128iAdd);
677
678 E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
679 E2 = _mm_add_epi32(E2, m128iAdd);
680
681 m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
682 O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
683 O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
684
685 m128iA = _mm_add_epi32(E1, O1);
686 m128iA = _mm_srai_epi32(m128iA, shift_2nd);
687 m128Tmp = _mm_add_epi32(E2, O2);
688 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
689 m128iA = _mm_packs_epi32(m128iA, m128Tmp);
690
691 m128iD = _mm_sub_epi32(E2, O2);
692 m128iD = _mm_srai_epi32(m128iD, shift_2nd);
693
694 m128Tmp = _mm_sub_epi32(E1, O1);
695 m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
696
697 m128iD = _mm_packs_epi32(m128iD, m128Tmp);
698
699 S0 = _mm_move_epi64(m128iA); //contains row 0
700 S8 = _mm_move_epi64(m128iD); //row 2
701 m128iA = _mm_srli_si128(m128iA, 8); // row 1
702 m128iD = _mm_srli_si128(m128iD, 8); // row 3
703 m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
704 m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
705 S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
706 S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
707
708 //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1); //mask to store 4 * 8bit data
709
710 m128iA = _mm_loadl_epi64((__m128i *) dst);
711 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712 m128iTmp1 = _mm_adds_epi16(S0, m128iA); //contains first 4 values
713 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716
717 dst += stride;
718
719 m128iA = _mm_loadl_epi64((__m128i *) dst);
720 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
722 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725
726 dst += stride;
727
728 m128iA = _mm_loadl_epi64((__m128i *) dst);
729 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
730 m128iTmp1 = _mm_adds_epi16(S8, m128iA);
731 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
732 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
733 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
734
735 dst += stride;
736
737 m128iA = _mm_loadl_epi64((__m128i *) dst);
738 m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
739 m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
740 m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
741 //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
742 *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
743 }
744 #endif
745
#if 0
/* Disabled 10-bit variant of the 4x4 DCT-II inverse transform.
 * NOTE(review): 'int16_t *src = coeffs' drops const from the parameter and
 * the second-stage result is stored back into the coefficient buffer before
 * the scalar reconstruction loop — fix before re-enabling. */
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride/2;
    int16_t *src = coeffs;

    int j;
    __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
    S0 = _mm_load_si128((__m128i *) (src));
    S8 = _mm_load_si128((__m128i *) (src + 8));
    m128iAdd = _mm_set1_epi32(add_1st);

    /* first stage: even part E1/E2, odd part O1/O2 */
    m128Tmp = _mm_unpacklo_epi16(S0, S8);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(S0, S8);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_1st); // Sum = Sum >> iShiftNum
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_1st); // Sum = Sum >> iShiftNum

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st); // Sum = Sum >> iShiftNum

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);

    /* transpose between stages */
    S0 = _mm_unpacklo_epi16(m128iA, m128iD);
    S8 = _mm_unpackhi_epi16(m128iA, m128iD);

    m128iA = _mm_unpacklo_epi16(S0, S8);
    m128iD = _mm_unpackhi_epi16(S0, S8);

    /* second stage */
    m128iAdd = _mm_set1_epi32(add_2nd);
    m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
    E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
    E1 = _mm_add_epi32(E1, m128iAdd);

    E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
    E2 = _mm_add_epi32(E2, m128iAdd);

    m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
    O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
    O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

    m128iA = _mm_add_epi32(E1, O1);
    m128iA = _mm_srai_epi32(m128iA, shift_2nd);
    m128Tmp = _mm_add_epi32(E2, O2);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
    m128iA = _mm_packs_epi32(m128iA, m128Tmp);

    m128iD = _mm_sub_epi32(E2, O2);
    m128iD = _mm_srai_epi32(m128iD, shift_2nd);

    m128Tmp = _mm_sub_epi32(E1, O1);
    m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);

    m128iD = _mm_packs_epi32(m128iD, m128Tmp);
    /* write back into the coefficient buffer, then do the clipped
     * reconstruction with scalar code (10-bit range needs av_clip_uintp2) */
    _mm_storeu_si128((__m128i *) (src), m128iA);
    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
    j = 0;
    for (i = 0; i < 2; i++) {
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
        j += 1;
        dst += stride;
    }
}
#endif
840
841 #if HAVE_SSE4_1
ff_hevc_transform_8x8_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)842 void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
843 ptrdiff_t _stride) {
844 uint8_t shift_2nd = 12; // 20 - Bit depth
845 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
846
847 uint8_t *dst = (uint8_t*) _dst;
848 ptrdiff_t stride = _stride / sizeof(uint8_t);
849 const int16_t *src = coeffs;
850 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
851 m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
852 E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
853
854 O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
855 T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
856 T0= _mm_load_si128((__m128i *) (transform8x8[0]));
857 T1= _mm_load_si128((__m128i *) (transform8x8[1]));
858 T2= _mm_load_si128((__m128i *) (transform8x8[2]));
859 T3= _mm_load_si128((__m128i *) (transform8x8[3]));
860 T4= _mm_load_si128((__m128i *) (transform8x8[4]));
861 T5= _mm_load_si128((__m128i *) (transform8x8[5]));
862 T6= _mm_load_si128((__m128i *) (transform8x8[6]));
863 T7= _mm_load_si128((__m128i *) (transform8x8[7]));
864 T8= _mm_load_si128((__m128i *) (transform8x8[8]));
865 T9= _mm_load_si128((__m128i *) (transform8x8[9]));
866 T10= _mm_load_si128((__m128i *) (transform8x8[10]));
867 T11= _mm_load_si128((__m128i *) (transform8x8[11]));
868
869 m128iAdd = _mm_set1_epi32(add_1st);
870
871 m128iS1 = _mm_load_si128((__m128i *) (src + 8));
872 m128iS3 = _mm_load_si128((__m128i *) (src + 24));
873 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
874 E1l = _mm_madd_epi16(m128Tmp0, T0);
875 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
876 E1h = _mm_madd_epi16(m128Tmp1, T0);
877 m128iS5 = _mm_load_si128((__m128i *) (src + 40));
878 m128iS7 = _mm_load_si128((__m128i *) (src + 56));
879 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
880 E2l = _mm_madd_epi16(m128Tmp2, T1);
881 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
882 E2h = _mm_madd_epi16(m128Tmp3, T1);
883 O0l = _mm_add_epi32(E1l, E2l);
884 O0h = _mm_add_epi32(E1h, E2h);
885
886 E1l = _mm_madd_epi16(m128Tmp0, T2);
887 E1h = _mm_madd_epi16(m128Tmp1, T2);
888 E2l = _mm_madd_epi16(m128Tmp2, T3);
889 E2h = _mm_madd_epi16(m128Tmp3, T3);
890
891 O1l = _mm_add_epi32(E1l, E2l);
892 O1h = _mm_add_epi32(E1h, E2h);
893
894 E1l = _mm_madd_epi16(m128Tmp0, T4);
895 E1h = _mm_madd_epi16(m128Tmp1, T4);
896 E2l = _mm_madd_epi16(m128Tmp2, T5);
897 E2h = _mm_madd_epi16(m128Tmp3, T5);
898 O2l = _mm_add_epi32(E1l, E2l);
899 O2h = _mm_add_epi32(E1h, E2h);
900
901 E1l = _mm_madd_epi16(m128Tmp0, T6);
902 E1h = _mm_madd_epi16(m128Tmp1, T6);
903 E2l = _mm_madd_epi16(m128Tmp2, T7);
904 E2h = _mm_madd_epi16(m128Tmp3, T7);
905 O3h = _mm_add_epi32(E1h, E2h);
906 O3l = _mm_add_epi32(E1l, E2l);
907
908 /* ------- */
909
910 m128iS0 = _mm_load_si128((__m128i *) (src + 0));
911 m128iS4 = _mm_load_si128((__m128i *) (src + 32));
912 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
913 EE0l = _mm_madd_epi16(m128Tmp0, T8);
914 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
915 EE0h = _mm_madd_epi16(m128Tmp1, T8);
916
917 EE1l = _mm_madd_epi16(m128Tmp0, T9);
918 EE1h = _mm_madd_epi16(m128Tmp1, T9);
919
920 /* ------- */
921
922 m128iS2 = _mm_load_si128((__m128i *) (src + 16));
923 m128iS6 = _mm_load_si128((__m128i *) (src + 48));
924 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
925 E00l = _mm_madd_epi16(m128Tmp0, T10);
926 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
927 E00h = _mm_madd_epi16(m128Tmp1, T10);
928 E01l = _mm_madd_epi16(m128Tmp0, T11);
929 E01h = _mm_madd_epi16(m128Tmp1, T11);
930 E0l = _mm_add_epi32(EE0l, E00l);
931 E0l = _mm_add_epi32(E0l, m128iAdd);
932 E0h = _mm_add_epi32(EE0h, E00h);
933 E0h = _mm_add_epi32(E0h, m128iAdd);
934 E3l = _mm_sub_epi32(EE0l, E00l);
935 E3l = _mm_add_epi32(E3l, m128iAdd);
936 E3h = _mm_sub_epi32(EE0h, E00h);
937 E3h = _mm_add_epi32(E3h, m128iAdd);
938
939 E1l = _mm_add_epi32(EE1l, E01l);
940 E1l = _mm_add_epi32(E1l, m128iAdd);
941 E1h = _mm_add_epi32(EE1h, E01h);
942 E1h = _mm_add_epi32(E1h, m128iAdd);
943 E2l = _mm_sub_epi32(EE1l, E01l);
944 E2l = _mm_add_epi32(E2l, m128iAdd);
945 E2h = _mm_sub_epi32(EE1h, E01h);
946 E2h = _mm_add_epi32(E2h, m128iAdd);
947 m128iS0 = _mm_packs_epi32(
948 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
949 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
950 m128iS1 = _mm_packs_epi32(
951 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
952 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
953 m128iS2 = _mm_packs_epi32(
954 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
955 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
956 m128iS3 = _mm_packs_epi32(
957 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
958 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
959 m128iS4 = _mm_packs_epi32(
960 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
961 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
962 m128iS5 = _mm_packs_epi32(
963 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
964 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
965 m128iS6 = _mm_packs_epi32(
966 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
967 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
968 m128iS7 = _mm_packs_epi32(
969 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
970 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
971 /* Invers matrix */
972
973 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
974 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
975 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
976 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
977 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
978 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
979 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
980 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
981 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
982 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
983 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
984 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
985 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
986 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
987 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
988 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
989 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
990 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
991 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
992 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
993 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
994 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
995 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
996 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
997
998 m128iAdd = _mm_set1_epi32(add_2nd);
999
1000 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1001 E1l = _mm_madd_epi16(m128Tmp0, T0);
1002 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1003 E1h = _mm_madd_epi16(m128Tmp1, T0);
1004 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1005 E2l = _mm_madd_epi16(m128Tmp2, T1);
1006 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1007 E2h = _mm_madd_epi16(m128Tmp3, T1);
1008 O0l = _mm_add_epi32(E1l, E2l);
1009 O0h = _mm_add_epi32(E1h, E2h);
1010 E1l = _mm_madd_epi16(m128Tmp0, T2);
1011 E1h = _mm_madd_epi16(m128Tmp1, T2);
1012 E2l = _mm_madd_epi16(m128Tmp2, T3);
1013 E2h = _mm_madd_epi16(m128Tmp3, T3);
1014 O1l = _mm_add_epi32(E1l, E2l);
1015 O1h = _mm_add_epi32(E1h, E2h);
1016 E1l = _mm_madd_epi16(m128Tmp0, T4);
1017 E1h = _mm_madd_epi16(m128Tmp1, T4);
1018 E2l = _mm_madd_epi16(m128Tmp2, T5);
1019 E2h = _mm_madd_epi16(m128Tmp3, T5);
1020 O2l = _mm_add_epi32(E1l, E2l);
1021 O2h = _mm_add_epi32(E1h, E2h);
1022 E1l = _mm_madd_epi16(m128Tmp0, T6);
1023 E1h = _mm_madd_epi16(m128Tmp1, T6);
1024 E2l = _mm_madd_epi16(m128Tmp2, T7);
1025 E2h = _mm_madd_epi16(m128Tmp3, T7);
1026 O3h = _mm_add_epi32(E1h, E2h);
1027 O3l = _mm_add_epi32(E1l, E2l);
1028
1029 m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1030 EE0l = _mm_madd_epi16(m128Tmp0, T8);
1031 m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1032 EE0h = _mm_madd_epi16(m128Tmp1, T8);
1033 EE1l = _mm_madd_epi16(m128Tmp0, T9);
1034 EE1h = _mm_madd_epi16(m128Tmp1, T9);
1035
1036 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1037 E00l = _mm_madd_epi16(m128Tmp0, T10);
1038 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1039 E00h = _mm_madd_epi16(m128Tmp1, T10);
1040 E01l = _mm_madd_epi16(m128Tmp0, T11);
1041 E01h = _mm_madd_epi16(m128Tmp1, T11);
1042 E0l = _mm_add_epi32(EE0l, E00l);
1043 E0l = _mm_add_epi32(E0l, m128iAdd);
1044 E0h = _mm_add_epi32(EE0h, E00h);
1045 E0h = _mm_add_epi32(E0h, m128iAdd);
1046 E3l = _mm_sub_epi32(EE0l, E00l);
1047 E3l = _mm_add_epi32(E3l, m128iAdd);
1048 E3h = _mm_sub_epi32(EE0h, E00h);
1049 E3h = _mm_add_epi32(E3h, m128iAdd);
1050 E1l = _mm_add_epi32(EE1l, E01l);
1051 E1l = _mm_add_epi32(E1l, m128iAdd);
1052 E1h = _mm_add_epi32(EE1h, E01h);
1053 E1h = _mm_add_epi32(E1h, m128iAdd);
1054 E2l = _mm_sub_epi32(EE1l, E01l);
1055 E2l = _mm_add_epi32(E2l, m128iAdd);
1056 E2h = _mm_sub_epi32(EE1h, E01h);
1057 E2h = _mm_add_epi32(E2h, m128iAdd);
1058
1059 m128iS0 = _mm_packs_epi32(
1060 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1061 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1062 m128iS1 = _mm_packs_epi32(
1063 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1064 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1065 m128iS2 = _mm_packs_epi32(
1066 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1067 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1068 m128iS3 = _mm_packs_epi32(
1069 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1070 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1071 m128iS4 = _mm_packs_epi32(
1072 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1073 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1074 m128iS5 = _mm_packs_epi32(
1075 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1076 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1077 m128iS6 = _mm_packs_epi32(
1078 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1079 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1080 m128iS7 = _mm_packs_epi32(
1081 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1082 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1083
1084 E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1085 E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1086 E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1087 E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1088 O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1089 O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1090 O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1091 O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1092 m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1093 m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1094 m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1095 m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1096 m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1097 m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1098 m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1099 m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1100 m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1101 m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1102 m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1103 m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1104 m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1105 m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1106 m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1107 m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1108
1109 E0l = _mm_loadl_epi64((__m128i *) dst);
1110 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1111
1112 E0l = _mm_adds_epi16(E0l, m128iS0);
1113 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1114 _mm_storel_epi64((__m128i *) dst, E0l);
1115 dst += stride;
1116
1117 E0l = _mm_loadl_epi64((__m128i *) dst);
1118 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1119
1120 E0l = _mm_adds_epi16(E0l, m128iS1);
1121 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1122 _mm_storel_epi64((__m128i *) dst, E0l);
1123 dst += stride;
1124
1125 E0l = _mm_loadl_epi64((__m128i *) dst);
1126 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1127
1128 E0l = _mm_adds_epi16(E0l, m128iS2);
1129 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1130 _mm_storel_epi64((__m128i *) dst, E0l);
1131 dst += stride;
1132
1133 E0l = _mm_loadl_epi64((__m128i *) dst);
1134 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1135
1136 E0l = _mm_adds_epi16(E0l, m128iS3);
1137 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1138 _mm_storel_epi64((__m128i *) dst, E0l);
1139 dst += stride;
1140
1141 E0l = _mm_loadl_epi64((__m128i *) dst);
1142 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1143
1144 E0l = _mm_adds_epi16(E0l, m128iS4);
1145 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1146 _mm_storel_epi64((__m128i *) dst, E0l);
1147 dst += stride;
1148
1149 E0l = _mm_loadl_epi64((__m128i *) dst);
1150 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1151
1152 E0l = _mm_adds_epi16(E0l, m128iS5);
1153 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1154 _mm_storel_epi64((__m128i *) dst, E0l);
1155 dst += stride;
1156
1157 E0l = _mm_loadl_epi64((__m128i *) dst);
1158 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1159
1160 E0l = _mm_adds_epi16(E0l, m128iS6);
1161 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1162 _mm_storel_epi64((__m128i *) dst, E0l);
1163 dst += stride;
1164
1165 E0l = _mm_loadl_epi64((__m128i *) dst);
1166 E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1167
1168 E0l = _mm_adds_epi16(E0l, m128iS7);
1169 E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1170 _mm_storel_epi64((__m128i *) dst, E0l);
1171 dst += stride;
1172
1173 }
1174 #endif
1175
1176 #if 0
/* Inverse 8x8 HEVC transform + reconstruction for 10-bit content.
 *
 * NOTE(review): this function sits inside an '#if 0' block, i.e. it is
 * currently compiled out. Comments below describe the visible code only.
 *
 * Same two-pass structure as the 8-bit SSE4 variant: column pass,
 * 16-bit transpose, row pass, then the second-pass result is stored
 * back into the coefficient buffer and added to the 16-bit destination
 * samples with a clip to the 10-bit range [0, 1023].
 *
 * _dst    : destination samples (uint16_t per pixel), top-left of block
 * _stride : destination stride in bytes
 */
void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i;
    uint16_t *dst = (uint16_t*) _dst;
    ptrdiff_t stride = _stride / sizeof(uint16_t);
    /* NOTE(review): discards 'const' from coeffs; the stores near the end
     * write the second-pass result back through 'src' into the caller's
     * coefficient buffer. If this code is re-enabled, confirm that callers
     * treat coeffs as scratch, or add an explicit cast. */
    int16_t *src = coeffs;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
            m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
            E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
            O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
    int j;
    m128iAdd = _mm_set1_epi32(add_1st);

    /* ---- first pass: odd part O0..O3 from coefficient rows 1,3,5,7 ---- */
    m128iS1 = _mm_load_si128((__m128i *) (src + 8));
    m128iS3 = _mm_load_si128((__m128i *) (src + 24));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[0])));
    m128iS5 = _mm_load_si128((__m128i *) (src + 40));
    m128iS7 = _mm_load_si128((__m128i *) (src + 56));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[3])));

    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);

    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    /* ---- even-even part EE0/EE1 from rows 0 and 4 ---- */

    m128iS0 = _mm_load_si128((__m128i *) (src + 0));
    m128iS4 = _mm_load_si128((__m128i *) (src + 32));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[8])));

    EE1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[9])));

    /* ---- even-odd part E00/E01 from rows 2 and 6 ---- */

    m128iS2 = _mm_load_si128((__m128i *) (src + 16));
    m128iS6 = _mm_load_si128((__m128i *) (src + 48));
    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[11])));
    /* Even butterfly E0..E3 = EE +/- E00/E01 with rounding offset folded in. */
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);

    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);
    /* Final butterfly (E +/- O), first-pass shift, saturating pack to 16 bit. */
    m128iS0 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
    m128iS1 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
    m128iS2 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
    m128iS3 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
    m128iS4 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
    m128iS5 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
    m128iS6 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
    m128iS7 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
    /* Invers matrix */

    /* ---- transpose the 8x8 block of 16-bit values via unpacks ---- */
    E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
    E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
    E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
    O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
    O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
    O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
    O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
    m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
    m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
    m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
    m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
    m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
    m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
    m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
    m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
    m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
    m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
    m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
    m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
    m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);

    /* ---- second pass: same structure with second-pass rounding ---- */
    m128iAdd = _mm_set1_epi32(add_2nd);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[0])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[0])));
    m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[1])));
    m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[1])));
    O0l = _mm_add_epi32(E1l, E2l);
    O0h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[2])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[2])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[3])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[3])));
    O1l = _mm_add_epi32(E1l, E2l);
    O1h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[4])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[4])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[5])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[5])));
    O2l = _mm_add_epi32(E1l, E2l);
    O2h = _mm_add_epi32(E1h, E2h);
    E1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[6])));
    E1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[6])));
    E2l = _mm_madd_epi16(m128Tmp2,
            _mm_load_si128((__m128i *) (transform8x8[7])));
    E2h = _mm_madd_epi16(m128Tmp3,
            _mm_load_si128((__m128i *) (transform8x8[7])));
    O3h = _mm_add_epi32(E1h, E2h);
    O3l = _mm_add_epi32(E1l, E2l);

    m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
    EE0l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[8])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
    EE0h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[8])));
    EE1l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[9])));
    EE1h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[9])));

    m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
    E00l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[10])));
    m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
    E00h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[10])));
    E01l = _mm_madd_epi16(m128Tmp0,
            _mm_load_si128((__m128i *) (transform8x8[11])));
    E01h = _mm_madd_epi16(m128Tmp1,
            _mm_load_si128((__m128i *) (transform8x8[11])));
    E0l = _mm_add_epi32(EE0l, E00l);
    E0l = _mm_add_epi32(E0l, m128iAdd);
    E0h = _mm_add_epi32(EE0h, E00h);
    E0h = _mm_add_epi32(E0h, m128iAdd);
    E3l = _mm_sub_epi32(EE0l, E00l);
    E3l = _mm_add_epi32(E3l, m128iAdd);
    E3h = _mm_sub_epi32(EE0h, E00h);
    E3h = _mm_add_epi32(E3h, m128iAdd);
    E1l = _mm_add_epi32(EE1l, E01l);
    E1l = _mm_add_epi32(E1l, m128iAdd);
    E1h = _mm_add_epi32(EE1h, E01h);
    E1h = _mm_add_epi32(E1h, m128iAdd);
    E2l = _mm_sub_epi32(EE1l, E01l);
    E2l = _mm_add_epi32(E2l, m128iAdd);
    E2h = _mm_sub_epi32(EE1h, E01h);
    E2h = _mm_add_epi32(E2h, m128iAdd);

    m128iS0 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
    m128iS1 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
    m128iS2 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
    m128iS3 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
            _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
    m128iS4 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
    m128iS5 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
    m128iS6 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
    m128iS7 = _mm_packs_epi32(
            _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
            _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));

    /* Second-pass result (still column-major) is spilled back into the
     * coefficient buffer; the scalar loop below reads it transposed. */
    _mm_store_si128((__m128i *) (src), m128iS0);
    _mm_store_si128((__m128i *) (src + 8), m128iS1);
    _mm_store_si128((__m128i *) (src + 16), m128iS2);
    _mm_store_si128((__m128i *) (src + 24), m128iS3);
    _mm_store_si128((__m128i *) (src + 32), m128iS4);
    _mm_store_si128((__m128i *) (src + 40), m128iS5);
    _mm_store_si128((__m128i *) (src + 48), m128iS6);
    _mm_store_si128((__m128i *) (src + 56), m128iS7);

    /* Scalar reconstruction: two rows per iteration; src[j + 8*k] walks
     * down column j, i.e. the transpose happens via the indexing. Each
     * sum is clipped to 10 bits (0..1023). */
    j = 0;
    for (i = 0; i < 4; i++) {
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
        j += 1;
        dst += stride;
        dst[0] = av_clip_uintp2(dst[0] + src[j],10);
        dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
        dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
        dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
        dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
        dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
        dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
        dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
        j += 1;
        dst += stride;
    }

}
1488 #endif
1489
1490
1491 #if HAVE_SSE4_1
ff_hevc_transform_16x16_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)1492 void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
1493 ptrdiff_t _stride) {
1494 uint8_t shift_2nd = 12; // 20 - Bit depth
1495 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1496 int i;
1497 uint8_t *dst = (uint8_t*) _dst;
1498 ptrdiff_t stride = _stride / sizeof(uint8_t);
1499 const int16_t *src = coeffs;
1500 int32_t shift;
1501 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1502 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1503 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1504 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1505 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1506 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1507 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1508 __m128i E4l, E5l, E6l, E7l;
1509 __m128i E4h, E5h, E6h, E7h;
1510 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1511 __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1512
1513
1514 /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1515 __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1516 __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1517 __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1518
1519 __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1520
1521 __m128i V00,V01, V10, V11;*/
1522
1523
1524 const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1525 const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1526 const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1527 const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1528 const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1529 const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1530 const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1531 const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1532 const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1533 const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1534 const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1535 const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1536 const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1537 const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1538 const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1539 const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1540 const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1541 const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1542 const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1543 const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1544 const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1545 const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1546 const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1547 const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1548 const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1549 const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1550 const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1551 const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1552 const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1553 const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1554 const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1555 const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1556
1557 const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1558 const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1559 const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1560 const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1561 const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1562 const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1563 const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1564 const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1565
1566 const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1567 const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1568 const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1569 const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1570
1571
1572
1573 int j;
1574 m128iS0 = _mm_load_si128((__m128i *) (src));
1575 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1576 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1577 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1578 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1579 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1580 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1581 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1582 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1583 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1584 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1585 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1586 m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1587 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1588 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1589 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1590 shift = shift_1st;
1591 m128iAdd = _mm_set1_epi32(add_1st);
1592
1593 for (j = 0; j < 2; j++) {
1594 for (i = 0; i < 16; i += 8) {
1595
1596 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1597 E0l = _mm_madd_epi16(m128Tmp0,T00);
1598 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1599 E0h = _mm_madd_epi16(m128Tmp1,T00);
1600
1601 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1602 E1l = _mm_madd_epi16(m128Tmp2,T10);
1603 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1604 E1h = _mm_madd_epi16(m128Tmp3,T10);
1605
1606 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1607 E2l = _mm_madd_epi16(m128Tmp4,T20);
1608 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1609 E2h = _mm_madd_epi16(m128Tmp5,T20);
1610
1611 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1612 E3l = _mm_madd_epi16(m128Tmp6,T30);
1613 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1614 E3h = _mm_madd_epi16(m128Tmp7,T30);
1615
1616 O0l = _mm_add_epi32(E0l, E1l);
1617 O0l = _mm_add_epi32(O0l, E2l);
1618 O0l = _mm_add_epi32(O0l, E3l);
1619
1620 O0h = _mm_add_epi32(E0h, E1h);
1621 O0h = _mm_add_epi32(O0h, E2h);
1622 O0h = _mm_add_epi32(O0h, E3h);
1623
1624 /* Compute O1*/
1625 E0l = _mm_madd_epi16(m128Tmp0,T01);
1626 E0h = _mm_madd_epi16(m128Tmp1,T01);
1627 E1l = _mm_madd_epi16(m128Tmp2,T11);
1628 E1h = _mm_madd_epi16(m128Tmp3,T11);
1629 E2l = _mm_madd_epi16(m128Tmp4,T21);
1630 E2h = _mm_madd_epi16(m128Tmp5,T21);
1631 E3l = _mm_madd_epi16(m128Tmp6,T31);
1632 E3h = _mm_madd_epi16(m128Tmp7,T31);
1633 O1l = _mm_add_epi32(E0l, E1l);
1634 O1l = _mm_add_epi32(O1l, E2l);
1635 O1l = _mm_add_epi32(O1l, E3l);
1636 O1h = _mm_add_epi32(E0h, E1h);
1637 O1h = _mm_add_epi32(O1h, E2h);
1638 O1h = _mm_add_epi32(O1h, E3h);
1639
1640 /* Compute O2*/
1641 E0l = _mm_madd_epi16(m128Tmp0,T02);
1642 E0h = _mm_madd_epi16(m128Tmp1,T02);
1643 E1l = _mm_madd_epi16(m128Tmp2,T12);
1644 E1h = _mm_madd_epi16(m128Tmp3,T12);
1645 E2l = _mm_madd_epi16(m128Tmp4,T22);
1646 E2h = _mm_madd_epi16(m128Tmp5,T22);
1647 E3l = _mm_madd_epi16(m128Tmp6,T32);
1648 E3h = _mm_madd_epi16(m128Tmp7,T32);
1649 O2l = _mm_add_epi32(E0l, E1l);
1650 O2l = _mm_add_epi32(O2l, E2l);
1651 O2l = _mm_add_epi32(O2l, E3l);
1652
1653 O2h = _mm_add_epi32(E0h, E1h);
1654 O2h = _mm_add_epi32(O2h, E2h);
1655 O2h = _mm_add_epi32(O2h, E3h);
1656
1657 /* Compute O3*/
1658 E0l = _mm_madd_epi16(m128Tmp0,T03);
1659 E0h = _mm_madd_epi16(m128Tmp1,T03);
1660 E1l = _mm_madd_epi16(m128Tmp2,T13);
1661 E1h = _mm_madd_epi16(m128Tmp3,T13);
1662 E2l = _mm_madd_epi16(m128Tmp4,T23);
1663 E2h = _mm_madd_epi16(m128Tmp5,T23);
1664 E3l = _mm_madd_epi16(m128Tmp6,T33);
1665 E3h = _mm_madd_epi16(m128Tmp7,T33);
1666
1667 O3l = _mm_add_epi32(E0l, E1l);
1668 O3l = _mm_add_epi32(O3l, E2l);
1669 O3l = _mm_add_epi32(O3l, E3l);
1670
1671 O3h = _mm_add_epi32(E0h, E1h);
1672 O3h = _mm_add_epi32(O3h, E2h);
1673 O3h = _mm_add_epi32(O3h, E3h);
1674
1675 /* Compute O4*/
1676
1677 E0l = _mm_madd_epi16(m128Tmp0,T04);
1678 E0h = _mm_madd_epi16(m128Tmp1,T04);
1679 E1l = _mm_madd_epi16(m128Tmp2,T14);
1680 E1h = _mm_madd_epi16(m128Tmp3,T14);
1681 E2l = _mm_madd_epi16(m128Tmp4,T24);
1682 E2h = _mm_madd_epi16(m128Tmp5,T24);
1683 E3l = _mm_madd_epi16(m128Tmp6,T34);
1684 E3h = _mm_madd_epi16(m128Tmp7,T34);
1685
1686 O4l = _mm_add_epi32(E0l, E1l);
1687 O4l = _mm_add_epi32(O4l, E2l);
1688 O4l = _mm_add_epi32(O4l, E3l);
1689
1690 O4h = _mm_add_epi32(E0h, E1h);
1691 O4h = _mm_add_epi32(O4h, E2h);
1692 O4h = _mm_add_epi32(O4h, E3h);
1693
1694 /* Compute O5*/
1695 E0l = _mm_madd_epi16(m128Tmp0,T05);
1696 E0h = _mm_madd_epi16(m128Tmp1,T05);
1697 E1l = _mm_madd_epi16(m128Tmp2,T15);
1698 E1h = _mm_madd_epi16(m128Tmp3,T15);
1699 E2l = _mm_madd_epi16(m128Tmp4,T25);
1700 E2h = _mm_madd_epi16(m128Tmp5,T25);
1701 E3l = _mm_madd_epi16(m128Tmp6,T35);
1702 E3h = _mm_madd_epi16(m128Tmp7,T35);
1703
1704 O5l = _mm_add_epi32(E0l, E1l);
1705 O5l = _mm_add_epi32(O5l, E2l);
1706 O5l = _mm_add_epi32(O5l, E3l);
1707
1708 O5h = _mm_add_epi32(E0h, E1h);
1709 O5h = _mm_add_epi32(O5h, E2h);
1710 O5h = _mm_add_epi32(O5h, E3h);
1711
1712 /* Compute O6*/
1713
1714 E0l = _mm_madd_epi16(m128Tmp0,T06);
1715 E0h = _mm_madd_epi16(m128Tmp1,T06);
1716 E1l = _mm_madd_epi16(m128Tmp2,T16);
1717 E1h = _mm_madd_epi16(m128Tmp3,T16);
1718 E2l = _mm_madd_epi16(m128Tmp4,T26);
1719 E2h = _mm_madd_epi16(m128Tmp5,T26);
1720 E3l = _mm_madd_epi16(m128Tmp6,T36);
1721 E3h = _mm_madd_epi16(m128Tmp7,T36);
1722
1723 O6l = _mm_add_epi32(E0l, E1l);
1724 O6l = _mm_add_epi32(O6l, E2l);
1725 O6l = _mm_add_epi32(O6l, E3l);
1726
1727 O6h = _mm_add_epi32(E0h, E1h);
1728 O6h = _mm_add_epi32(O6h, E2h);
1729 O6h = _mm_add_epi32(O6h, E3h);
1730
1731 /* Compute O7*/
1732
1733 E0l = _mm_madd_epi16(m128Tmp0,T07);
1734 E0h = _mm_madd_epi16(m128Tmp1,T07);
1735 E1l = _mm_madd_epi16(m128Tmp2,T17);
1736 E1h = _mm_madd_epi16(m128Tmp3,T17);
1737 E2l = _mm_madd_epi16(m128Tmp4,T27);
1738 E2h = _mm_madd_epi16(m128Tmp5,T27);
1739 E3l = _mm_madd_epi16(m128Tmp6,T37);
1740 E3h = _mm_madd_epi16(m128Tmp7,T37);
1741
1742 O7l = _mm_add_epi32(E0l, E1l);
1743 O7l = _mm_add_epi32(O7l, E2l);
1744 O7l = _mm_add_epi32(O7l, E3l);
1745
1746 O7h = _mm_add_epi32(E0h, E1h);
1747 O7h = _mm_add_epi32(O7h, E2h);
1748 O7h = _mm_add_epi32(O7h, E3h);
1749
1750 /* Compute E0 */
1751
1752
1753
1754 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1755 E0l = _mm_madd_epi16(m128Tmp0,U00);
1756 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1757 E0h = _mm_madd_epi16(m128Tmp1,U00);
1758
1759 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1760 E0l = _mm_add_epi32(E0l,
1761 _mm_madd_epi16(m128Tmp2,U10));
1762 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1763 E0h = _mm_add_epi32(E0h,
1764 _mm_madd_epi16(m128Tmp3,U10));
1765
1766 /* Compute E1 */
1767 E1l = _mm_madd_epi16(m128Tmp0,U01);
1768 E1h = _mm_madd_epi16(m128Tmp1,U01);
1769 E1l = _mm_add_epi32(E1l,
1770 _mm_madd_epi16(m128Tmp2,U11));
1771 E1h = _mm_add_epi32(E1h,
1772 _mm_madd_epi16(m128Tmp3,U11));
1773
1774 /* Compute E2 */
1775 E2l = _mm_madd_epi16(m128Tmp0,U02);
1776 E2h = _mm_madd_epi16(m128Tmp1,U02);
1777 E2l = _mm_add_epi32(E2l,
1778 _mm_madd_epi16(m128Tmp2,U12));
1779 E2h = _mm_add_epi32(E2h,
1780 _mm_madd_epi16(m128Tmp3,U12));
1781 /* Compute E3 */
1782 E3l = _mm_madd_epi16(m128Tmp0,U03);
1783 E3h = _mm_madd_epi16(m128Tmp1,U03);
1784 E3l = _mm_add_epi32(E3l,
1785 _mm_madd_epi16(m128Tmp2,U13));
1786 E3h = _mm_add_epi32(E3h,
1787 _mm_madd_epi16(m128Tmp3,U13));
1788
1789 /* Compute EE0 and EEE */
1790
1791 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1792 E00l = _mm_madd_epi16(m128Tmp0,V00);
1793 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1794 E00h = _mm_madd_epi16(m128Tmp1,V00);
1795
1796 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1797 EE0l = _mm_madd_epi16(m128Tmp2,V10);
1798 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1799 EE0h = _mm_madd_epi16(m128Tmp3,V10);
1800
1801 E01l = _mm_madd_epi16(m128Tmp0,V01);
1802 E01h = _mm_madd_epi16(m128Tmp1,V01);
1803
1804 EE1l = _mm_madd_epi16(m128Tmp2,V11);
1805 EE1h = _mm_madd_epi16(m128Tmp3,V11);
1806
1807 /* Compute EE */
1808 EE2l = _mm_sub_epi32(EE1l, E01l);
1809 EE3l = _mm_sub_epi32(EE0l, E00l);
1810 EE2h = _mm_sub_epi32(EE1h, E01h);
1811 EE3h = _mm_sub_epi32(EE0h, E00h);
1812
1813 EE0l = _mm_add_epi32(EE0l, E00l);
1814 EE1l = _mm_add_epi32(EE1l, E01l);
1815 EE0h = _mm_add_epi32(EE0h, E00h);
1816 EE1h = _mm_add_epi32(EE1h, E01h);
1817
1818 /* Compute E */
1819
1820 E4l = _mm_sub_epi32(EE3l, E3l);
1821 E4l = _mm_add_epi32(E4l, m128iAdd);
1822
1823 E5l = _mm_sub_epi32(EE2l, E2l);
1824 E5l = _mm_add_epi32(E5l, m128iAdd);
1825
1826 E6l = _mm_sub_epi32(EE1l, E1l);
1827 E6l = _mm_add_epi32(E6l, m128iAdd);
1828
1829 E7l = _mm_sub_epi32(EE0l, E0l);
1830 E7l = _mm_add_epi32(E7l, m128iAdd);
1831
1832 E4h = _mm_sub_epi32(EE3h, E3h);
1833 E4h = _mm_add_epi32(E4h, m128iAdd);
1834
1835 E5h = _mm_sub_epi32(EE2h, E2h);
1836 E5h = _mm_add_epi32(E5h, m128iAdd);
1837
1838 E6h = _mm_sub_epi32(EE1h, E1h);
1839 E6h = _mm_add_epi32(E6h, m128iAdd);
1840
1841 E7h = _mm_sub_epi32(EE0h, E0h);
1842 E7h = _mm_add_epi32(E7h, m128iAdd);
1843
1844 E0l = _mm_add_epi32(EE0l, E0l);
1845 E0l = _mm_add_epi32(E0l, m128iAdd);
1846
1847 E1l = _mm_add_epi32(EE1l, E1l);
1848 E1l = _mm_add_epi32(E1l, m128iAdd);
1849
1850 E2l = _mm_add_epi32(EE2l, E2l);
1851 E2l = _mm_add_epi32(E2l, m128iAdd);
1852
1853 E3l = _mm_add_epi32(EE3l, E3l);
1854 E3l = _mm_add_epi32(E3l, m128iAdd);
1855
1856 E0h = _mm_add_epi32(EE0h, E0h);
1857 E0h = _mm_add_epi32(E0h, m128iAdd);
1858
1859 E1h = _mm_add_epi32(EE1h, E1h);
1860 E1h = _mm_add_epi32(E1h, m128iAdd);
1861
1862 E2h = _mm_add_epi32(EE2h, E2h);
1863 E2h = _mm_add_epi32(E2h, m128iAdd);
1864
1865 E3h = _mm_add_epi32(EE3h, E3h);
1866 E3h = _mm_add_epi32(E3h, m128iAdd);
1867
1868 m128iS0 = _mm_packs_epi32(
1869 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1870 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1871 m128iS1 = _mm_packs_epi32(
1872 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1873 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1874 m128iS2 = _mm_packs_epi32(
1875 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1876 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1877 m128iS3 = _mm_packs_epi32(
1878 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1879 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1880
1881 m128iS4 = _mm_packs_epi32(
1882 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1883 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1884 m128iS5 = _mm_packs_epi32(
1885 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1886 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1887 m128iS6 = _mm_packs_epi32(
1888 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1889 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1890 m128iS7 = _mm_packs_epi32(
1891 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1892 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1893
1894 m128iS15 = _mm_packs_epi32(
1895 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1896 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1897 m128iS14 = _mm_packs_epi32(
1898 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1899 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1900 m128iS13 = _mm_packs_epi32(
1901 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1902 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1903 m128iS12 = _mm_packs_epi32(
1904 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1905 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1906
1907 m128iS11 = _mm_packs_epi32(
1908 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1909 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1910 m128iS10 = _mm_packs_epi32(
1911 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1912 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1913 m128iS9 = _mm_packs_epi32(
1914 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1915 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1916 m128iS8 = _mm_packs_epi32(
1917 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1918 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1919
1920
1921
1922 if (!j) { //first pass
1923
1924 /* Inverse the matrix */
1925 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1926 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1927 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1928 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1929 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1930 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1931 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1932 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1933
1934 E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1935 E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1936 E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1937 E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1938 E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1939 E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1940 E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1941 E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1942
1943 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1944 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1945 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1946 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1947
1948 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1949 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1950 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1951 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1952
1953 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1954 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1955 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1956 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1957
1958 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1959 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1960 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1961 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1962
1963 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1964 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1965 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1966 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1967
1968 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1969 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1970 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1971 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1972
1973 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1974 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1975 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1976 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1977
1978 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1979 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1980 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1981 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1982
1983 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1984 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1985 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1986 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1987
1988 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1989 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1990 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1991 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1992
1993 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1994 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1995 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1996 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1997
1998 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1999 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2000 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2001 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2002
2003 if (!i) {
2004
2005 r0= m128iS0; //0
2006 r1= m128iS1; //16
2007 r2= m128iS2; //32
2008 r3= m128iS3; //48
2009 r4= m128iS4; //64
2010 r5= m128iS5; //80
2011 r6= m128iS6; //96
2012 r7= m128iS7; //112
2013 r8= m128iS8; //128
2014 r9= m128iS9; //144
2015 r10= m128iS10; //160
2016 r11= m128iS11; //176
2017 r12= m128iS12; //192
2018 r13= m128iS13; //208
2019 r14= m128iS14; //224
2020 r15= m128iS15; //240
2021
2022
2023
2024 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2025 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2026 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2027 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2028 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2029 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2030 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2031 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2032 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2033 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2034 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2035 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2036 m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2037 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2038 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2039 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2040 } else {
2041
2042 r16= m128iS0; //8
2043 r17= m128iS1; //24
2044 r18= m128iS2; //40
2045 r19= m128iS3; //56
2046 r20= m128iS4; //72
2047 r21= m128iS5; //88
2048 r22= m128iS6; //104
2049 r23= m128iS7; //120
2050 r24= m128iS8; //136
2051 r25= m128iS9; //152
2052 r26= m128iS10; //168
2053 r27= m128iS11; //184
2054 r28= m128iS12; //200
2055 r29= m128iS13; //216
2056 r30= m128iS14; //232
2057 r31= m128iS15; //248
2058
2059 //prepare next iteration :
2060
2061 m128iS0= r0;
2062 m128iS1= r2;
2063 m128iS2= r4;
2064 m128iS3= r6;
2065 m128iS4= r8;
2066 m128iS5= r10;
2067 m128iS6= r12;
2068 m128iS7= r14;
2069 m128iS8= r16;
2070 m128iS9= r18;
2071 m128iS10=r20;
2072 m128iS11=r22;
2073 m128iS12=r24;
2074 m128iS13=r26;
2075 m128iS14=r28;
2076 m128iS15=r30;
2077
2078 shift = shift_2nd;
2079 m128iAdd = _mm_set1_epi32(add_2nd);
2080 }
2081
2082 } else {
2083
2084 //transpose half matrix :
2085 //instead of having 1 register = 1 half-column,
2086 //1 register = 1 half-row.
2087 E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2088 E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2089 E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2090 E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2091 E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2092 E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2093 E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2094 E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2095
2096 O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2097 O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2098 O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2099 O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2100 O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2101 O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2102 O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2103 O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2104
2105
2106 m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2107 m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2108
2109 m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2110 m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2111
2112 r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1); //1st half 1st row
2113 r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3); //2nd half 1st row
2114
2115
2116 r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1); //1st half 2nd row
2117 r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3); //2nd half 2nd row
2118
2119 m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2120 m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2121 m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2122 m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2123
2124
2125 r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2126 r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2127
2128 r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2129 r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2130
2131 m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2132 m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2133 m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2134 m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2135
2136 r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2137 r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2138
2139
2140 r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2141 r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2142
2143 m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2144 m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2145 m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2146 m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2147
2148 r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2149 r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2150
2151
2152 r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2153 r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2154
2155 dst = (uint8_t*) (_dst + (i*stride));
2156 m128Tmp0= _mm_setzero_si128();
2157 m128Tmp1= _mm_load_si128((__m128i*)dst);
2158 m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2159 m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2160 m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2161 m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2162 m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2163 m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2164 E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2165
2166
2167 r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2168 r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2169 r0= _mm_packus_epi16(r0,r2);
2170
2171
2172
2173
2174 r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2175 r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2176 r4= _mm_packus_epi16(r4,r6);
2177
2178
2179 r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2180 r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2181 r8= _mm_packus_epi16(r8,r10);
2182
2183
2184 r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2185 r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2186 r12= _mm_packus_epi16(r12,r14);
2187
2188
2189 r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2190 r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2191 r16= _mm_packus_epi16(r16,r18);
2192
2193
2194 r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2195 r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2196 r20= _mm_packus_epi16(r20,r22);
2197
2198
2199 r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2200 r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2201 r24= _mm_packus_epi16(r24,r26);
2202
2203
2204
2205 r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2206 r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2207 r28= _mm_packus_epi16(r28,r30);
2208
2209 _mm_store_si128((__m128i*)dst,r0);
2210 _mm_store_si128((__m128i*)(dst+stride),r4);
2211 _mm_store_si128((__m128i*)(dst+2*stride),r8);
2212 _mm_store_si128((__m128i*)(dst+3*stride),r12);
2213 _mm_store_si128((__m128i*)(dst+4*stride),r16);
2214 _mm_store_si128((__m128i*)(dst+5*stride),r20);
2215 _mm_store_si128((__m128i*)(dst+6*stride),r24);
2216 _mm_store_si128((__m128i*)(dst+7*stride),r28);
2217
2218
2219
2220 if (!i) {
2221 //first half done, can store !
2222
2223
2224 m128iS0= r1;
2225 m128iS1= r3;
2226 m128iS2= r5;
2227 m128iS3= r7;
2228 m128iS4= r9;
2229 m128iS5= r11;
2230 m128iS6= r13;
2231 m128iS7= r15;
2232 m128iS8= r17;
2233 m128iS9= r19;
2234 m128iS10=r21;
2235 m128iS11=r23;
2236 m128iS12=r25;
2237 m128iS13=r27;
2238 m128iS14=r29;
2239 m128iS15=r31;
2240 }
2241 }
2242 }
2243 }
2244 }
2245 #endif
2246
2247
2248 #if 0
2249 void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
2250 ptrdiff_t _stride) {
2251 int i;
2252 uint16_t *dst = (uint16_t*) _dst;
2253 ptrdiff_t stride = _stride / 2;
2254 int16_t *src = coeffs;
2255 int32_t shift;
2256 uint8_t shift_2nd = 10; //20 - bit depth
2257 uint16_t add_2nd = 1 << 9; //shift - 1;
2258 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2259 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2260 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2261 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2262 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2263 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2264 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2265 __m128i E4l, E5l, E6l, E7l;
2266 __m128i E4h, E5h, E6h, E7h;
2267 int j;
2268 m128iS0 = _mm_load_si128((__m128i *) (src));
2269 m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2270 m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2271 m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2272 m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2273 m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2274 m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2275 m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2276 m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2277 m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2278 m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2279 m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2280 m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2281 m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2282 m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2283 m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2284 shift = shift_1st;
2285 m128iAdd = _mm_set1_epi32(add_1st);
2286
2287 for (j = 0; j < 2; j++) {
2288 for (i = 0; i < 16; i += 8) {
2289
2290 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2291 E0l = _mm_madd_epi16(m128Tmp0,
2292 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2293 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2294 E0h = _mm_madd_epi16(m128Tmp1,
2295 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2296
2297 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2298 E1l = _mm_madd_epi16(m128Tmp2,
2299 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2300 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2301 E1h = _mm_madd_epi16(m128Tmp3,
2302 _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2303
2304 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2305 E2l = _mm_madd_epi16(m128Tmp4,
2306 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2307 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2308 E2h = _mm_madd_epi16(m128Tmp5,
2309 _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2310
2311 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2312 E3l = _mm_madd_epi16(m128Tmp6,
2313 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2314 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2315 E3h = _mm_madd_epi16(m128Tmp7,
2316 _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2317
2318 O0l = _mm_add_epi32(E0l, E1l);
2319 O0l = _mm_add_epi32(O0l, E2l);
2320 O0l = _mm_add_epi32(O0l, E3l);
2321
2322 O0h = _mm_add_epi32(E0h, E1h);
2323 O0h = _mm_add_epi32(O0h, E2h);
2324 O0h = _mm_add_epi32(O0h, E3h);
2325
2326 /* Compute O1*/
2327 E0l = _mm_madd_epi16(m128Tmp0,
2328 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2329 E0h = _mm_madd_epi16(m128Tmp1,
2330 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2331 E1l = _mm_madd_epi16(m128Tmp2,
2332 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2333 E1h = _mm_madd_epi16(m128Tmp3,
2334 _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2335 E2l = _mm_madd_epi16(m128Tmp4,
2336 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2337 E2h = _mm_madd_epi16(m128Tmp5,
2338 _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2339 E3l = _mm_madd_epi16(m128Tmp6,
2340 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2341 E3h = _mm_madd_epi16(m128Tmp7,
2342 _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2343 O1l = _mm_add_epi32(E0l, E1l);
2344 O1l = _mm_add_epi32(O1l, E2l);
2345 O1l = _mm_add_epi32(O1l, E3l);
2346 O1h = _mm_add_epi32(E0h, E1h);
2347 O1h = _mm_add_epi32(O1h, E2h);
2348 O1h = _mm_add_epi32(O1h, E3h);
2349
2350 /* Compute O2*/
2351 E0l = _mm_madd_epi16(m128Tmp0,
2352 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2353 E0h = _mm_madd_epi16(m128Tmp1,
2354 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2355 E1l = _mm_madd_epi16(m128Tmp2,
2356 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2357 E1h = _mm_madd_epi16(m128Tmp3,
2358 _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2359 E2l = _mm_madd_epi16(m128Tmp4,
2360 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2361 E2h = _mm_madd_epi16(m128Tmp5,
2362 _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2363 E3l = _mm_madd_epi16(m128Tmp6,
2364 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2365 E3h = _mm_madd_epi16(m128Tmp7,
2366 _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2367 O2l = _mm_add_epi32(E0l, E1l);
2368 O2l = _mm_add_epi32(O2l, E2l);
2369 O2l = _mm_add_epi32(O2l, E3l);
2370
2371 O2h = _mm_add_epi32(E0h, E1h);
2372 O2h = _mm_add_epi32(O2h, E2h);
2373 O2h = _mm_add_epi32(O2h, E3h);
2374
2375 /* Compute O3*/
2376 E0l = _mm_madd_epi16(m128Tmp0,
2377 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2378 E0h = _mm_madd_epi16(m128Tmp1,
2379 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2380 E1l = _mm_madd_epi16(m128Tmp2,
2381 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2382 E1h = _mm_madd_epi16(m128Tmp3,
2383 _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2384 E2l = _mm_madd_epi16(m128Tmp4,
2385 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2386 E2h = _mm_madd_epi16(m128Tmp5,
2387 _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2388 E3l = _mm_madd_epi16(m128Tmp6,
2389 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2390 E3h = _mm_madd_epi16(m128Tmp7,
2391 _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2392
2393 O3l = _mm_add_epi32(E0l, E1l);
2394 O3l = _mm_add_epi32(O3l, E2l);
2395 O3l = _mm_add_epi32(O3l, E3l);
2396
2397 O3h = _mm_add_epi32(E0h, E1h);
2398 O3h = _mm_add_epi32(O3h, E2h);
2399 O3h = _mm_add_epi32(O3h, E3h);
2400
2401 /* Compute O4*/
2402
2403 E0l = _mm_madd_epi16(m128Tmp0,
2404 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2405 E0h = _mm_madd_epi16(m128Tmp1,
2406 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2407 E1l = _mm_madd_epi16(m128Tmp2,
2408 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2409 E1h = _mm_madd_epi16(m128Tmp3,
2410 _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2411 E2l = _mm_madd_epi16(m128Tmp4,
2412 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2413 E2h = _mm_madd_epi16(m128Tmp5,
2414 _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2415 E3l = _mm_madd_epi16(m128Tmp6,
2416 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2417 E3h = _mm_madd_epi16(m128Tmp7,
2418 _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2419
2420 O4l = _mm_add_epi32(E0l, E1l);
2421 O4l = _mm_add_epi32(O4l, E2l);
2422 O4l = _mm_add_epi32(O4l, E3l);
2423
2424 O4h = _mm_add_epi32(E0h, E1h);
2425 O4h = _mm_add_epi32(O4h, E2h);
2426 O4h = _mm_add_epi32(O4h, E3h);
2427
2428 /* Compute O5*/
2429 E0l = _mm_madd_epi16(m128Tmp0,
2430 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2431 E0h = _mm_madd_epi16(m128Tmp1,
2432 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2433 E1l = _mm_madd_epi16(m128Tmp2,
2434 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2435 E1h = _mm_madd_epi16(m128Tmp3,
2436 _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2437 E2l = _mm_madd_epi16(m128Tmp4,
2438 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2439 E2h = _mm_madd_epi16(m128Tmp5,
2440 _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2441 E3l = _mm_madd_epi16(m128Tmp6,
2442 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2443 E3h = _mm_madd_epi16(m128Tmp7,
2444 _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2445
2446 O5l = _mm_add_epi32(E0l, E1l);
2447 O5l = _mm_add_epi32(O5l, E2l);
2448 O5l = _mm_add_epi32(O5l, E3l);
2449
2450 O5h = _mm_add_epi32(E0h, E1h);
2451 O5h = _mm_add_epi32(O5h, E2h);
2452 O5h = _mm_add_epi32(O5h, E3h);
2453
2454 /* Compute O6*/
2455
2456 E0l = _mm_madd_epi16(m128Tmp0,
2457 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2458 E0h = _mm_madd_epi16(m128Tmp1,
2459 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2460 E1l = _mm_madd_epi16(m128Tmp2,
2461 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2462 E1h = _mm_madd_epi16(m128Tmp3,
2463 _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2464 E2l = _mm_madd_epi16(m128Tmp4,
2465 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2466 E2h = _mm_madd_epi16(m128Tmp5,
2467 _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2468 E3l = _mm_madd_epi16(m128Tmp6,
2469 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2470 E3h = _mm_madd_epi16(m128Tmp7,
2471 _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2472
2473 O6l = _mm_add_epi32(E0l, E1l);
2474 O6l = _mm_add_epi32(O6l, E2l);
2475 O6l = _mm_add_epi32(O6l, E3l);
2476
2477 O6h = _mm_add_epi32(E0h, E1h);
2478 O6h = _mm_add_epi32(O6h, E2h);
2479 O6h = _mm_add_epi32(O6h, E3h);
2480
2481 /* Compute O7*/
2482
2483 E0l = _mm_madd_epi16(m128Tmp0,
2484 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2485 E0h = _mm_madd_epi16(m128Tmp1,
2486 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2487 E1l = _mm_madd_epi16(m128Tmp2,
2488 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2489 E1h = _mm_madd_epi16(m128Tmp3,
2490 _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2491 E2l = _mm_madd_epi16(m128Tmp4,
2492 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2493 E2h = _mm_madd_epi16(m128Tmp5,
2494 _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2495 E3l = _mm_madd_epi16(m128Tmp6,
2496 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2497 E3h = _mm_madd_epi16(m128Tmp7,
2498 _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2499
2500 O7l = _mm_add_epi32(E0l, E1l);
2501 O7l = _mm_add_epi32(O7l, E2l);
2502 O7l = _mm_add_epi32(O7l, E3l);
2503
2504 O7h = _mm_add_epi32(E0h, E1h);
2505 O7h = _mm_add_epi32(O7h, E2h);
2506 O7h = _mm_add_epi32(O7h, E3h);
2507
2508 /* Compute E0 */
2509
2510 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2511 E0l = _mm_madd_epi16(m128Tmp0,
2512 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2513 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2514 E0h = _mm_madd_epi16(m128Tmp1,
2515 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2516
2517 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2518 E0l = _mm_add_epi32(E0l,
2519 _mm_madd_epi16(m128Tmp2,
2520 _mm_load_si128(
2521 (__m128i *) (transform16x16_2[1][0]))));
2522 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2523 E0h = _mm_add_epi32(E0h,
2524 _mm_madd_epi16(m128Tmp3,
2525 _mm_load_si128(
2526 (__m128i *) (transform16x16_2[1][0]))));
2527
2528 /* Compute E1 */
2529 E1l = _mm_madd_epi16(m128Tmp0,
2530 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2531 E1h = _mm_madd_epi16(m128Tmp1,
2532 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2533 E1l = _mm_add_epi32(E1l,
2534 _mm_madd_epi16(m128Tmp2,
2535 _mm_load_si128(
2536 (__m128i *) (transform16x16_2[1][1]))));
2537 E1h = _mm_add_epi32(E1h,
2538 _mm_madd_epi16(m128Tmp3,
2539 _mm_load_si128(
2540 (__m128i *) (transform16x16_2[1][1]))));
2541
2542 /* Compute E2 */
2543 E2l = _mm_madd_epi16(m128Tmp0,
2544 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2545 E2h = _mm_madd_epi16(m128Tmp1,
2546 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2547 E2l = _mm_add_epi32(E2l,
2548 _mm_madd_epi16(m128Tmp2,
2549 _mm_load_si128(
2550 (__m128i *) (transform16x16_2[1][2]))));
2551 E2h = _mm_add_epi32(E2h,
2552 _mm_madd_epi16(m128Tmp3,
2553 _mm_load_si128(
2554 (__m128i *) (transform16x16_2[1][2]))));
2555 /* Compute E3 */
2556 E3l = _mm_madd_epi16(m128Tmp0,
2557 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2558 E3h = _mm_madd_epi16(m128Tmp1,
2559 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2560 E3l = _mm_add_epi32(E3l,
2561 _mm_madd_epi16(m128Tmp2,
2562 _mm_load_si128(
2563 (__m128i *) (transform16x16_2[1][3]))));
2564 E3h = _mm_add_epi32(E3h,
2565 _mm_madd_epi16(m128Tmp3,
2566 _mm_load_si128(
2567 (__m128i *) (transform16x16_2[1][3]))));
2568
2569 /* Compute EE0 and EEE */
2570
2571 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2572 E00l = _mm_madd_epi16(m128Tmp0,
2573 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2574 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2575 E00h = _mm_madd_epi16(m128Tmp1,
2576 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2577
2578 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2579 EE0l = _mm_madd_epi16(m128Tmp2,
2580 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2581 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2582 EE0h = _mm_madd_epi16(m128Tmp3,
2583 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2584
2585 E01l = _mm_madd_epi16(m128Tmp0,
2586 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2587 E01h = _mm_madd_epi16(m128Tmp1,
2588 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2589
2590 EE1l = _mm_madd_epi16(m128Tmp2,
2591 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2592 EE1h = _mm_madd_epi16(m128Tmp3,
2593 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2594
2595 /* Compute EE */
2596 EE2l = _mm_sub_epi32(EE1l, E01l);
2597 EE3l = _mm_sub_epi32(EE0l, E00l);
2598 EE2h = _mm_sub_epi32(EE1h, E01h);
2599 EE3h = _mm_sub_epi32(EE0h, E00h);
2600
2601 EE0l = _mm_add_epi32(EE0l, E00l);
2602 EE1l = _mm_add_epi32(EE1l, E01l);
2603 EE0h = _mm_add_epi32(EE0h, E00h);
2604 EE1h = _mm_add_epi32(EE1h, E01h);
2605
2606 /* Compute E */
2607
2608 E4l = _mm_sub_epi32(EE3l, E3l);
2609 E4l = _mm_add_epi32(E4l, m128iAdd);
2610
2611 E5l = _mm_sub_epi32(EE2l, E2l);
2612 E5l = _mm_add_epi32(E5l, m128iAdd);
2613
2614 E6l = _mm_sub_epi32(EE1l, E1l);
2615 E6l = _mm_add_epi32(E6l, m128iAdd);
2616
2617 E7l = _mm_sub_epi32(EE0l, E0l);
2618 E7l = _mm_add_epi32(E7l, m128iAdd);
2619
2620 E4h = _mm_sub_epi32(EE3h, E3h);
2621 E4h = _mm_add_epi32(E4h, m128iAdd);
2622
2623 E5h = _mm_sub_epi32(EE2h, E2h);
2624 E5h = _mm_add_epi32(E5h, m128iAdd);
2625
2626 E6h = _mm_sub_epi32(EE1h, E1h);
2627 E6h = _mm_add_epi32(E6h, m128iAdd);
2628
2629 E7h = _mm_sub_epi32(EE0h, E0h);
2630 E7h = _mm_add_epi32(E7h, m128iAdd);
2631
2632 E0l = _mm_add_epi32(EE0l, E0l);
2633 E0l = _mm_add_epi32(E0l, m128iAdd);
2634
2635 E1l = _mm_add_epi32(EE1l, E1l);
2636 E1l = _mm_add_epi32(E1l, m128iAdd);
2637
2638 E2l = _mm_add_epi32(EE2l, E2l);
2639 E2l = _mm_add_epi32(E2l, m128iAdd);
2640
2641 E3l = _mm_add_epi32(EE3l, E3l);
2642 E3l = _mm_add_epi32(E3l, m128iAdd);
2643
2644 E0h = _mm_add_epi32(EE0h, E0h);
2645 E0h = _mm_add_epi32(E0h, m128iAdd);
2646
2647 E1h = _mm_add_epi32(EE1h, E1h);
2648 E1h = _mm_add_epi32(E1h, m128iAdd);
2649
2650 E2h = _mm_add_epi32(EE2h, E2h);
2651 E2h = _mm_add_epi32(E2h, m128iAdd);
2652
2653 E3h = _mm_add_epi32(EE3h, E3h);
2654 E3h = _mm_add_epi32(E3h, m128iAdd);
2655
2656 m128iS0 = _mm_packs_epi32(
2657 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2658 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2659 m128iS1 = _mm_packs_epi32(
2660 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2661 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2662 m128iS2 = _mm_packs_epi32(
2663 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2664 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2665 m128iS3 = _mm_packs_epi32(
2666 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2667 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2668
2669 m128iS4 = _mm_packs_epi32(
2670 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2671 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2672 m128iS5 = _mm_packs_epi32(
2673 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2674 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2675 m128iS6 = _mm_packs_epi32(
2676 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2677 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2678 m128iS7 = _mm_packs_epi32(
2679 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2680 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2681
2682 m128iS15 = _mm_packs_epi32(
2683 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2684 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2685 m128iS14 = _mm_packs_epi32(
2686 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2687 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2688 m128iS13 = _mm_packs_epi32(
2689 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2690 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2691 m128iS12 = _mm_packs_epi32(
2692 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2693 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2694
2695 m128iS11 = _mm_packs_epi32(
2696 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2697 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2698 m128iS10 = _mm_packs_epi32(
2699 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2700 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2701 m128iS9 = _mm_packs_epi32(
2702 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2703 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2704 m128iS8 = _mm_packs_epi32(
2705 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2706 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2707
2708 if (!j) {
2709 /* Inverse the matrix */
2710 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2711 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2712 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2713 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2714 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2715 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2716 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2717 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2718
2719 O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2720 O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2721 O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2722 O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2723 O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2724 O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2725 O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2726 O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2727
2728 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2729 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2730 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2731 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2732
2733 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2734 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2735 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2736 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2737
2738 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2739 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2740 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2741 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2742
2743 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2744 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2745 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2746 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2747
2748 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2749 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2750 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2751 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2752
2753 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2754 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2755 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2756 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2757
2758 m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2759 m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2760 m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2761 m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2762
2763 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2764 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2765 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2766 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2767
2768 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2769 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2770 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2771 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2772
2773 m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2774 m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2775 m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2776 m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2777
2778 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2779 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2780 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2781 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2782
2783 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2784 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2785 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2786 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2787
2788 /* */
2789 _mm_store_si128((__m128i *) (src + i), m128iS0);
2790 _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2791 _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2792 _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2793 _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2794 _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2795 _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2796 _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2797 _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2798 _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2799 _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2800 _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2801 _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2802 _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2803 _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2804 _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2805
2806 if (!i) {
2807 m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2808 m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2809 m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2810 m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2811 m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2812 m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2813 m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2814 m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2815 m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2816 m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2817 m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2818 m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2819 m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2820 m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2821 m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2822 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2823 } else {
2824 m128iS0 = _mm_load_si128((__m128i *) (src));
2825 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2826 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2827 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2828 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2829 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2830 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2831 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2832 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2833 m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2834 m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2835 m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2836 m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2837 m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2838 m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2839 m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2840 shift = shift_2nd;
2841 m128iAdd = _mm_set1_epi32(add_2nd);
2842 }
2843
2844 } else {
2845 int k, m = 0;
2846 _mm_storeu_si128((__m128i *) (src), m128iS0);
2847 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2848 _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2849 _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2850 _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2851 _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2852 _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2853 _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2854 _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2855 _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2856 _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2857 _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2858 _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2859 _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2860 _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2861 _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2862 dst = (uint16_t*) _dst + (i * stride);
2863
2864 for (k = 0; k < 8; k++) {
2865 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2866 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2867 dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2868 dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2869 dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2870 dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2871 dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2872 dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2873
2874 dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2875 dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2876 dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2877 dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2878 dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2879 dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2880 dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2881 dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2882 m += 1;
2883 dst += stride;
2884 }
2885 if (!i) {
2886 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2887 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2888 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2889 m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2890 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2891 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2892 m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2893 m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2894 m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2895 m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2896 m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2897 m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2898 m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2899 m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2900 m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2901 m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2902 }
2903 }
2904 }
2905 }
2906
2907 }
2908 #endif
2909
2910
2911 #if HAVE_SSE4_1
ff_hevc_transform_32x32_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)2912 void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
2913 ptrdiff_t _stride) {
2914 uint8_t shift_2nd = 12; // 20 - Bit depth
2915 uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2916 int i, j;
2917 uint8_t *dst = (uint8_t*) _dst;
2918 ptrdiff_t stride = _stride / sizeof(uint8_t);
2919 int shift;
2920 const int16_t *src = coeffs;
2921
2922 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2923 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2924 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2925 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2926 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2927 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2928 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2929 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2930 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2931 EEE0l, EEE1l, EEE0h, EEE1h;
2932 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2933 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2934 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2935 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2936 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2937 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2938 EE4l, EE7h, EE6h, EE5h, EE4h;
2939
2940 __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2941 __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2942 __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2943 __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2944
2945
2946 m128iS0 = _mm_load_si128((__m128i *) (src));
2947 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2948 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2949 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2950 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2951 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2952 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2953 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2954 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2955 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2956 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2957 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2958 m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2959 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2960 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2961 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2962 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2963 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2964 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2965 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2966 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2967 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2968 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2969 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2970 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2971 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2972 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2973 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2974 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2975 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2976 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2977 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2978
2979 shift = shift_1st;
2980 m128iAdd = _mm_set1_epi32(add_1st);
2981
2982 for (j = 0; j < 2; j++) {
2983 for (i = 0; i < 32; i += 8) {
2984 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2985 E0l = _mm_madd_epi16(m128Tmp0,
2986 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2987 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2988 E0h = _mm_madd_epi16(m128Tmp1,
2989 _mm_load_si128((__m128i *) (transform32x32[0][0])));
2990
2991 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2992 E1l = _mm_madd_epi16(m128Tmp2,
2993 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2994 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2995 E1h = _mm_madd_epi16(m128Tmp3,
2996 _mm_load_si128((__m128i *) (transform32x32[1][0])));
2997
2998 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2999 E2l = _mm_madd_epi16(m128Tmp4,
3000 _mm_load_si128((__m128i *) (transform32x32[2][0])));
3001 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3002 E2h = _mm_madd_epi16(m128Tmp5,
3003 _mm_load_si128((__m128i *) (transform32x32[2][0])));
3004
3005 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3006 E3l = _mm_madd_epi16(m128Tmp6,
3007 _mm_load_si128((__m128i *) (transform32x32[3][0])));
3008 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3009 E3h = _mm_madd_epi16(m128Tmp7,
3010 _mm_load_si128((__m128i *) (transform32x32[3][0])));
3011
3012 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3013 E4l = _mm_madd_epi16(m128Tmp8,
3014 _mm_load_si128((__m128i *) (transform32x32[4][0])));
3015 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3016 E4h = _mm_madd_epi16(m128Tmp9,
3017 _mm_load_si128((__m128i *) (transform32x32[4][0])));
3018
3019 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3020 E5l = _mm_madd_epi16(m128Tmp10,
3021 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3022 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3023 E5h = _mm_madd_epi16(m128Tmp11,
3024 _mm_load_si128((__m128i *) (transform32x32[5][0])));
3025
3026 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3027 E6l = _mm_madd_epi16(m128Tmp12,
3028 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3029 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3030 E6h = _mm_madd_epi16(m128Tmp13,
3031 _mm_load_si128((__m128i *) (transform32x32[6][0])));
3032
3033 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3034 E7l = _mm_madd_epi16(m128Tmp14,
3035 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3036 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3037 E7h = _mm_madd_epi16(m128Tmp15,
3038 _mm_load_si128((__m128i *) (transform32x32[7][0])));
3039
3040 O0l = _mm_add_epi32(E0l, E1l);
3041 O0l = _mm_add_epi32(O0l, E2l);
3042 O0l = _mm_add_epi32(O0l, E3l);
3043 O0l = _mm_add_epi32(O0l, E4l);
3044 O0l = _mm_add_epi32(O0l, E5l);
3045 O0l = _mm_add_epi32(O0l, E6l);
3046 O0l = _mm_add_epi32(O0l, E7l);
3047
3048 O0h = _mm_add_epi32(E0h, E1h);
3049 O0h = _mm_add_epi32(O0h, E2h);
3050 O0h = _mm_add_epi32(O0h, E3h);
3051 O0h = _mm_add_epi32(O0h, E4h);
3052 O0h = _mm_add_epi32(O0h, E5h);
3053 O0h = _mm_add_epi32(O0h, E6h);
3054 O0h = _mm_add_epi32(O0h, E7h);
3055
3056 /* Compute O1*/
3057 E0l = _mm_madd_epi16(m128Tmp0,
3058 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3059 E0h = _mm_madd_epi16(m128Tmp1,
3060 _mm_load_si128((__m128i *) (transform32x32[0][1])));
3061 E1l = _mm_madd_epi16(m128Tmp2,
3062 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3063 E1h = _mm_madd_epi16(m128Tmp3,
3064 _mm_load_si128((__m128i *) (transform32x32[1][1])));
3065 E2l = _mm_madd_epi16(m128Tmp4,
3066 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3067 E2h = _mm_madd_epi16(m128Tmp5,
3068 _mm_load_si128((__m128i *) (transform32x32[2][1])));
3069 E3l = _mm_madd_epi16(m128Tmp6,
3070 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3071 E3h = _mm_madd_epi16(m128Tmp7,
3072 _mm_load_si128((__m128i *) (transform32x32[3][1])));
3073
3074 E4l = _mm_madd_epi16(m128Tmp8,
3075 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3076 E4h = _mm_madd_epi16(m128Tmp9,
3077 _mm_load_si128((__m128i *) (transform32x32[4][1])));
3078 E5l = _mm_madd_epi16(m128Tmp10,
3079 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3080 E5h = _mm_madd_epi16(m128Tmp11,
3081 _mm_load_si128((__m128i *) (transform32x32[5][1])));
3082 E6l = _mm_madd_epi16(m128Tmp12,
3083 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3084 E6h = _mm_madd_epi16(m128Tmp13,
3085 _mm_load_si128((__m128i *) (transform32x32[6][1])));
3086 E7l = _mm_madd_epi16(m128Tmp14,
3087 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3088 E7h = _mm_madd_epi16(m128Tmp15,
3089 _mm_load_si128((__m128i *) (transform32x32[7][1])));
3090
3091 O1l = _mm_add_epi32(E0l, E1l);
3092 O1l = _mm_add_epi32(O1l, E2l);
3093 O1l = _mm_add_epi32(O1l, E3l);
3094 O1l = _mm_add_epi32(O1l, E4l);
3095 O1l = _mm_add_epi32(O1l, E5l);
3096 O1l = _mm_add_epi32(O1l, E6l);
3097 O1l = _mm_add_epi32(O1l, E7l);
3098
3099 O1h = _mm_add_epi32(E0h, E1h);
3100 O1h = _mm_add_epi32(O1h, E2h);
3101 O1h = _mm_add_epi32(O1h, E3h);
3102 O1h = _mm_add_epi32(O1h, E4h);
3103 O1h = _mm_add_epi32(O1h, E5h);
3104 O1h = _mm_add_epi32(O1h, E6h);
3105 O1h = _mm_add_epi32(O1h, E7h);
3106 /* Compute O2*/
3107 E0l = _mm_madd_epi16(m128Tmp0,
3108 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3109 E0h = _mm_madd_epi16(m128Tmp1,
3110 _mm_load_si128((__m128i *) (transform32x32[0][2])));
3111 E1l = _mm_madd_epi16(m128Tmp2,
3112 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3113 E1h = _mm_madd_epi16(m128Tmp3,
3114 _mm_load_si128((__m128i *) (transform32x32[1][2])));
3115 E2l = _mm_madd_epi16(m128Tmp4,
3116 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3117 E2h = _mm_madd_epi16(m128Tmp5,
3118 _mm_load_si128((__m128i *) (transform32x32[2][2])));
3119 E3l = _mm_madd_epi16(m128Tmp6,
3120 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3121 E3h = _mm_madd_epi16(m128Tmp7,
3122 _mm_load_si128((__m128i *) (transform32x32[3][2])));
3123
3124 E4l = _mm_madd_epi16(m128Tmp8,
3125 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3126 E4h = _mm_madd_epi16(m128Tmp9,
3127 _mm_load_si128((__m128i *) (transform32x32[4][2])));
3128 E5l = _mm_madd_epi16(m128Tmp10,
3129 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3130 E5h = _mm_madd_epi16(m128Tmp11,
3131 _mm_load_si128((__m128i *) (transform32x32[5][2])));
3132 E6l = _mm_madd_epi16(m128Tmp12,
3133 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3134 E6h = _mm_madd_epi16(m128Tmp13,
3135 _mm_load_si128((__m128i *) (transform32x32[6][2])));
3136 E7l = _mm_madd_epi16(m128Tmp14,
3137 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3138 E7h = _mm_madd_epi16(m128Tmp15,
3139 _mm_load_si128((__m128i *) (transform32x32[7][2])));
3140
3141 O2l = _mm_add_epi32(E0l, E1l);
3142 O2l = _mm_add_epi32(O2l, E2l);
3143 O2l = _mm_add_epi32(O2l, E3l);
3144 O2l = _mm_add_epi32(O2l, E4l);
3145 O2l = _mm_add_epi32(O2l, E5l);
3146 O2l = _mm_add_epi32(O2l, E6l);
3147 O2l = _mm_add_epi32(O2l, E7l);
3148
3149 O2h = _mm_add_epi32(E0h, E1h);
3150 O2h = _mm_add_epi32(O2h, E2h);
3151 O2h = _mm_add_epi32(O2h, E3h);
3152 O2h = _mm_add_epi32(O2h, E4h);
3153 O2h = _mm_add_epi32(O2h, E5h);
3154 O2h = _mm_add_epi32(O2h, E6h);
3155 O2h = _mm_add_epi32(O2h, E7h);
3156 /* Compute O3*/
3157 E0l = _mm_madd_epi16(m128Tmp0,
3158 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3159 E0h = _mm_madd_epi16(m128Tmp1,
3160 _mm_load_si128((__m128i *) (transform32x32[0][3])));
3161 E1l = _mm_madd_epi16(m128Tmp2,
3162 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3163 E1h = _mm_madd_epi16(m128Tmp3,
3164 _mm_load_si128((__m128i *) (transform32x32[1][3])));
3165 E2l = _mm_madd_epi16(m128Tmp4,
3166 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3167 E2h = _mm_madd_epi16(m128Tmp5,
3168 _mm_load_si128((__m128i *) (transform32x32[2][3])));
3169 E3l = _mm_madd_epi16(m128Tmp6,
3170 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3171 E3h = _mm_madd_epi16(m128Tmp7,
3172 _mm_load_si128((__m128i *) (transform32x32[3][3])));
3173
3174 E4l = _mm_madd_epi16(m128Tmp8,
3175 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3176 E4h = _mm_madd_epi16(m128Tmp9,
3177 _mm_load_si128((__m128i *) (transform32x32[4][3])));
3178 E5l = _mm_madd_epi16(m128Tmp10,
3179 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3180 E5h = _mm_madd_epi16(m128Tmp11,
3181 _mm_load_si128((__m128i *) (transform32x32[5][3])));
3182 E6l = _mm_madd_epi16(m128Tmp12,
3183 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3184 E6h = _mm_madd_epi16(m128Tmp13,
3185 _mm_load_si128((__m128i *) (transform32x32[6][3])));
3186 E7l = _mm_madd_epi16(m128Tmp14,
3187 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3188 E7h = _mm_madd_epi16(m128Tmp15,
3189 _mm_load_si128((__m128i *) (transform32x32[7][3])));
3190
3191 O3l = _mm_add_epi32(E0l, E1l);
3192 O3l = _mm_add_epi32(O3l, E2l);
3193 O3l = _mm_add_epi32(O3l, E3l);
3194 O3l = _mm_add_epi32(O3l, E4l);
3195 O3l = _mm_add_epi32(O3l, E5l);
3196 O3l = _mm_add_epi32(O3l, E6l);
3197 O3l = _mm_add_epi32(O3l, E7l);
3198
3199 O3h = _mm_add_epi32(E0h, E1h);
3200 O3h = _mm_add_epi32(O3h, E2h);
3201 O3h = _mm_add_epi32(O3h, E3h);
3202 O3h = _mm_add_epi32(O3h, E4h);
3203 O3h = _mm_add_epi32(O3h, E5h);
3204 O3h = _mm_add_epi32(O3h, E6h);
3205 O3h = _mm_add_epi32(O3h, E7h);
3206 /* Compute O4*/
3207
3208 E0l = _mm_madd_epi16(m128Tmp0,
3209 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3210 E0h = _mm_madd_epi16(m128Tmp1,
3211 _mm_load_si128((__m128i *) (transform32x32[0][4])));
3212 E1l = _mm_madd_epi16(m128Tmp2,
3213 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3214 E1h = _mm_madd_epi16(m128Tmp3,
3215 _mm_load_si128((__m128i *) (transform32x32[1][4])));
3216 E2l = _mm_madd_epi16(m128Tmp4,
3217 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3218 E2h = _mm_madd_epi16(m128Tmp5,
3219 _mm_load_si128((__m128i *) (transform32x32[2][4])));
3220 E3l = _mm_madd_epi16(m128Tmp6,
3221 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3222 E3h = _mm_madd_epi16(m128Tmp7,
3223 _mm_load_si128((__m128i *) (transform32x32[3][4])));
3224
3225 E4l = _mm_madd_epi16(m128Tmp8,
3226 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3227 E4h = _mm_madd_epi16(m128Tmp9,
3228 _mm_load_si128((__m128i *) (transform32x32[4][4])));
3229 E5l = _mm_madd_epi16(m128Tmp10,
3230 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3231 E5h = _mm_madd_epi16(m128Tmp11,
3232 _mm_load_si128((__m128i *) (transform32x32[5][4])));
3233 E6l = _mm_madd_epi16(m128Tmp12,
3234 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3235 E6h = _mm_madd_epi16(m128Tmp13,
3236 _mm_load_si128((__m128i *) (transform32x32[6][4])));
3237 E7l = _mm_madd_epi16(m128Tmp14,
3238 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3239 E7h = _mm_madd_epi16(m128Tmp15,
3240 _mm_load_si128((__m128i *) (transform32x32[7][4])));
3241
3242 O4l = _mm_add_epi32(E0l, E1l);
3243 O4l = _mm_add_epi32(O4l, E2l);
3244 O4l = _mm_add_epi32(O4l, E3l);
3245 O4l = _mm_add_epi32(O4l, E4l);
3246 O4l = _mm_add_epi32(O4l, E5l);
3247 O4l = _mm_add_epi32(O4l, E6l);
3248 O4l = _mm_add_epi32(O4l, E7l);
3249
3250 O4h = _mm_add_epi32(E0h, E1h);
3251 O4h = _mm_add_epi32(O4h, E2h);
3252 O4h = _mm_add_epi32(O4h, E3h);
3253 O4h = _mm_add_epi32(O4h, E4h);
3254 O4h = _mm_add_epi32(O4h, E5h);
3255 O4h = _mm_add_epi32(O4h, E6h);
3256 O4h = _mm_add_epi32(O4h, E7h);
3257
3258 /* Compute O5*/
3259 E0l = _mm_madd_epi16(m128Tmp0,
3260 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3261 E0h = _mm_madd_epi16(m128Tmp1,
3262 _mm_load_si128((__m128i *) (transform32x32[0][5])));
3263 E1l = _mm_madd_epi16(m128Tmp2,
3264 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3265 E1h = _mm_madd_epi16(m128Tmp3,
3266 _mm_load_si128((__m128i *) (transform32x32[1][5])));
3267 E2l = _mm_madd_epi16(m128Tmp4,
3268 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3269 E2h = _mm_madd_epi16(m128Tmp5,
3270 _mm_load_si128((__m128i *) (transform32x32[2][5])));
3271 E3l = _mm_madd_epi16(m128Tmp6,
3272 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3273 E3h = _mm_madd_epi16(m128Tmp7,
3274 _mm_load_si128((__m128i *) (transform32x32[3][5])));
3275
3276 E4l = _mm_madd_epi16(m128Tmp8,
3277 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3278 E4h = _mm_madd_epi16(m128Tmp9,
3279 _mm_load_si128((__m128i *) (transform32x32[4][5])));
3280 E5l = _mm_madd_epi16(m128Tmp10,
3281 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3282 E5h = _mm_madd_epi16(m128Tmp11,
3283 _mm_load_si128((__m128i *) (transform32x32[5][5])));
3284 E6l = _mm_madd_epi16(m128Tmp12,
3285 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3286 E6h = _mm_madd_epi16(m128Tmp13,
3287 _mm_load_si128((__m128i *) (transform32x32[6][5])));
3288 E7l = _mm_madd_epi16(m128Tmp14,
3289 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3290 E7h = _mm_madd_epi16(m128Tmp15,
3291 _mm_load_si128((__m128i *) (transform32x32[7][5])));
3292
3293 O5l = _mm_add_epi32(E0l, E1l);
3294 O5l = _mm_add_epi32(O5l, E2l);
3295 O5l = _mm_add_epi32(O5l, E3l);
3296 O5l = _mm_add_epi32(O5l, E4l);
3297 O5l = _mm_add_epi32(O5l, E5l);
3298 O5l = _mm_add_epi32(O5l, E6l);
3299 O5l = _mm_add_epi32(O5l, E7l);
3300
3301 O5h = _mm_add_epi32(E0h, E1h);
3302 O5h = _mm_add_epi32(O5h, E2h);
3303 O5h = _mm_add_epi32(O5h, E3h);
3304 O5h = _mm_add_epi32(O5h, E4h);
3305 O5h = _mm_add_epi32(O5h, E5h);
3306 O5h = _mm_add_epi32(O5h, E6h);
3307 O5h = _mm_add_epi32(O5h, E7h);
3308
3309 /* Compute O6*/
3310
3311 E0l = _mm_madd_epi16(m128Tmp0,
3312 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3313 E0h = _mm_madd_epi16(m128Tmp1,
3314 _mm_load_si128((__m128i *) (transform32x32[0][6])));
3315 E1l = _mm_madd_epi16(m128Tmp2,
3316 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3317 E1h = _mm_madd_epi16(m128Tmp3,
3318 _mm_load_si128((__m128i *) (transform32x32[1][6])));
3319 E2l = _mm_madd_epi16(m128Tmp4,
3320 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3321 E2h = _mm_madd_epi16(m128Tmp5,
3322 _mm_load_si128((__m128i *) (transform32x32[2][6])));
3323 E3l = _mm_madd_epi16(m128Tmp6,
3324 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3325 E3h = _mm_madd_epi16(m128Tmp7,
3326 _mm_load_si128((__m128i *) (transform32x32[3][6])));
3327
3328 E4l = _mm_madd_epi16(m128Tmp8,
3329 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3330 E4h = _mm_madd_epi16(m128Tmp9,
3331 _mm_load_si128((__m128i *) (transform32x32[4][6])));
3332 E5l = _mm_madd_epi16(m128Tmp10,
3333 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3334 E5h = _mm_madd_epi16(m128Tmp11,
3335 _mm_load_si128((__m128i *) (transform32x32[5][6])));
3336 E6l = _mm_madd_epi16(m128Tmp12,
3337 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3338 E6h = _mm_madd_epi16(m128Tmp13,
3339 _mm_load_si128((__m128i *) (transform32x32[6][6])));
3340 E7l = _mm_madd_epi16(m128Tmp14,
3341 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3342 E7h = _mm_madd_epi16(m128Tmp15,
3343 _mm_load_si128((__m128i *) (transform32x32[7][6])));
3344
3345 O6l = _mm_add_epi32(E0l, E1l);
3346 O6l = _mm_add_epi32(O6l, E2l);
3347 O6l = _mm_add_epi32(O6l, E3l);
3348 O6l = _mm_add_epi32(O6l, E4l);
3349 O6l = _mm_add_epi32(O6l, E5l);
3350 O6l = _mm_add_epi32(O6l, E6l);
3351 O6l = _mm_add_epi32(O6l, E7l);
3352
3353 O6h = _mm_add_epi32(E0h, E1h);
3354 O6h = _mm_add_epi32(O6h, E2h);
3355 O6h = _mm_add_epi32(O6h, E3h);
3356 O6h = _mm_add_epi32(O6h, E4h);
3357 O6h = _mm_add_epi32(O6h, E5h);
3358 O6h = _mm_add_epi32(O6h, E6h);
3359 O6h = _mm_add_epi32(O6h, E7h);
3360
3361 /* Compute O7*/
3362
3363 E0l = _mm_madd_epi16(m128Tmp0,
3364 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3365 E0h = _mm_madd_epi16(m128Tmp1,
3366 _mm_load_si128((__m128i *) (transform32x32[0][7])));
3367 E1l = _mm_madd_epi16(m128Tmp2,
3368 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3369 E1h = _mm_madd_epi16(m128Tmp3,
3370 _mm_load_si128((__m128i *) (transform32x32[1][7])));
3371 E2l = _mm_madd_epi16(m128Tmp4,
3372 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3373 E2h = _mm_madd_epi16(m128Tmp5,
3374 _mm_load_si128((__m128i *) (transform32x32[2][7])));
3375 E3l = _mm_madd_epi16(m128Tmp6,
3376 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3377 E3h = _mm_madd_epi16(m128Tmp7,
3378 _mm_load_si128((__m128i *) (transform32x32[3][7])));
3379
3380 E4l = _mm_madd_epi16(m128Tmp8,
3381 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3382 E4h = _mm_madd_epi16(m128Tmp9,
3383 _mm_load_si128((__m128i *) (transform32x32[4][7])));
3384 E5l = _mm_madd_epi16(m128Tmp10,
3385 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3386 E5h = _mm_madd_epi16(m128Tmp11,
3387 _mm_load_si128((__m128i *) (transform32x32[5][7])));
3388 E6l = _mm_madd_epi16(m128Tmp12,
3389 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3390 E6h = _mm_madd_epi16(m128Tmp13,
3391 _mm_load_si128((__m128i *) (transform32x32[6][7])));
3392 E7l = _mm_madd_epi16(m128Tmp14,
3393 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3394 E7h = _mm_madd_epi16(m128Tmp15,
3395 _mm_load_si128((__m128i *) (transform32x32[7][7])));
3396
3397 O7l = _mm_add_epi32(E0l, E1l);
3398 O7l = _mm_add_epi32(O7l, E2l);
3399 O7l = _mm_add_epi32(O7l, E3l);
3400 O7l = _mm_add_epi32(O7l, E4l);
3401 O7l = _mm_add_epi32(O7l, E5l);
3402 O7l = _mm_add_epi32(O7l, E6l);
3403 O7l = _mm_add_epi32(O7l, E7l);
3404
3405 O7h = _mm_add_epi32(E0h, E1h);
3406 O7h = _mm_add_epi32(O7h, E2h);
3407 O7h = _mm_add_epi32(O7h, E3h);
3408 O7h = _mm_add_epi32(O7h, E4h);
3409 O7h = _mm_add_epi32(O7h, E5h);
3410 O7h = _mm_add_epi32(O7h, E6h);
3411 O7h = _mm_add_epi32(O7h, E7h);
3412
3413 /* Compute O8*/
3414
3415 E0l = _mm_madd_epi16(m128Tmp0,
3416 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3417 E0h = _mm_madd_epi16(m128Tmp1,
3418 _mm_load_si128((__m128i *) (transform32x32[0][8])));
3419 E1l = _mm_madd_epi16(m128Tmp2,
3420 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3421 E1h = _mm_madd_epi16(m128Tmp3,
3422 _mm_load_si128((__m128i *) (transform32x32[1][8])));
3423 E2l = _mm_madd_epi16(m128Tmp4,
3424 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3425 E2h = _mm_madd_epi16(m128Tmp5,
3426 _mm_load_si128((__m128i *) (transform32x32[2][8])));
3427 E3l = _mm_madd_epi16(m128Tmp6,
3428 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3429 E3h = _mm_madd_epi16(m128Tmp7,
3430 _mm_load_si128((__m128i *) (transform32x32[3][8])));
3431
3432 E4l = _mm_madd_epi16(m128Tmp8,
3433 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3434 E4h = _mm_madd_epi16(m128Tmp9,
3435 _mm_load_si128((__m128i *) (transform32x32[4][8])));
3436 E5l = _mm_madd_epi16(m128Tmp10,
3437 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3438 E5h = _mm_madd_epi16(m128Tmp11,
3439 _mm_load_si128((__m128i *) (transform32x32[5][8])));
3440 E6l = _mm_madd_epi16(m128Tmp12,
3441 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3442 E6h = _mm_madd_epi16(m128Tmp13,
3443 _mm_load_si128((__m128i *) (transform32x32[6][8])));
3444 E7l = _mm_madd_epi16(m128Tmp14,
3445 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3446 E7h = _mm_madd_epi16(m128Tmp15,
3447 _mm_load_si128((__m128i *) (transform32x32[7][8])));
3448
3449 O8l = _mm_add_epi32(E0l, E1l);
3450 O8l = _mm_add_epi32(O8l, E2l);
3451 O8l = _mm_add_epi32(O8l, E3l);
3452 O8l = _mm_add_epi32(O8l, E4l);
3453 O8l = _mm_add_epi32(O8l, E5l);
3454 O8l = _mm_add_epi32(O8l, E6l);
3455 O8l = _mm_add_epi32(O8l, E7l);
3456
3457 O8h = _mm_add_epi32(E0h, E1h);
3458 O8h = _mm_add_epi32(O8h, E2h);
3459 O8h = _mm_add_epi32(O8h, E3h);
3460 O8h = _mm_add_epi32(O8h, E4h);
3461 O8h = _mm_add_epi32(O8h, E5h);
3462 O8h = _mm_add_epi32(O8h, E6h);
3463 O8h = _mm_add_epi32(O8h, E7h);
3464
3465 /* Compute O9*/
3466
3467 E0l = _mm_madd_epi16(m128Tmp0,
3468 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3469 E0h = _mm_madd_epi16(m128Tmp1,
3470 _mm_load_si128((__m128i *) (transform32x32[0][9])));
3471 E1l = _mm_madd_epi16(m128Tmp2,
3472 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3473 E1h = _mm_madd_epi16(m128Tmp3,
3474 _mm_load_si128((__m128i *) (transform32x32[1][9])));
3475 E2l = _mm_madd_epi16(m128Tmp4,
3476 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3477 E2h = _mm_madd_epi16(m128Tmp5,
3478 _mm_load_si128((__m128i *) (transform32x32[2][9])));
3479 E3l = _mm_madd_epi16(m128Tmp6,
3480 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3481 E3h = _mm_madd_epi16(m128Tmp7,
3482 _mm_load_si128((__m128i *) (transform32x32[3][9])));
3483
3484 E4l = _mm_madd_epi16(m128Tmp8,
3485 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3486 E4h = _mm_madd_epi16(m128Tmp9,
3487 _mm_load_si128((__m128i *) (transform32x32[4][9])));
3488 E5l = _mm_madd_epi16(m128Tmp10,
3489 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3490 E5h = _mm_madd_epi16(m128Tmp11,
3491 _mm_load_si128((__m128i *) (transform32x32[5][9])));
3492 E6l = _mm_madd_epi16(m128Tmp12,
3493 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3494 E6h = _mm_madd_epi16(m128Tmp13,
3495 _mm_load_si128((__m128i *) (transform32x32[6][9])));
3496 E7l = _mm_madd_epi16(m128Tmp14,
3497 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3498 E7h = _mm_madd_epi16(m128Tmp15,
3499 _mm_load_si128((__m128i *) (transform32x32[7][9])));
3500
3501 O9l = _mm_add_epi32(E0l, E1l);
3502 O9l = _mm_add_epi32(O9l, E2l);
3503 O9l = _mm_add_epi32(O9l, E3l);
3504 O9l = _mm_add_epi32(O9l, E4l);
3505 O9l = _mm_add_epi32(O9l, E5l);
3506 O9l = _mm_add_epi32(O9l, E6l);
3507 O9l = _mm_add_epi32(O9l, E7l);
3508
3509 O9h = _mm_add_epi32(E0h, E1h);
3510 O9h = _mm_add_epi32(O9h, E2h);
3511 O9h = _mm_add_epi32(O9h, E3h);
3512 O9h = _mm_add_epi32(O9h, E4h);
3513 O9h = _mm_add_epi32(O9h, E5h);
3514 O9h = _mm_add_epi32(O9h, E6h);
3515 O9h = _mm_add_epi32(O9h, E7h);
3516
3517 /* Compute O10 */
3518
3519 E0l = _mm_madd_epi16(m128Tmp0,
3520 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3521 E0h = _mm_madd_epi16(m128Tmp1,
3522 _mm_load_si128((__m128i *) (transform32x32[0][10])));
3523 E1l = _mm_madd_epi16(m128Tmp2,
3524 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3525 E1h = _mm_madd_epi16(m128Tmp3,
3526 _mm_load_si128((__m128i *) (transform32x32[1][10])));
3527 E2l = _mm_madd_epi16(m128Tmp4,
3528 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3529 E2h = _mm_madd_epi16(m128Tmp5,
3530 _mm_load_si128((__m128i *) (transform32x32[2][10])));
3531 E3l = _mm_madd_epi16(m128Tmp6,
3532 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3533 E3h = _mm_madd_epi16(m128Tmp7,
3534 _mm_load_si128((__m128i *) (transform32x32[3][10])));
3535
3536 E4l = _mm_madd_epi16(m128Tmp8,
3537 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3538 E4h = _mm_madd_epi16(m128Tmp9,
3539 _mm_load_si128((__m128i *) (transform32x32[4][10])));
3540 E5l = _mm_madd_epi16(m128Tmp10,
3541 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3542 E5h = _mm_madd_epi16(m128Tmp11,
3543 _mm_load_si128((__m128i *) (transform32x32[5][10])));
3544 E6l = _mm_madd_epi16(m128Tmp12,
3545 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3546 E6h = _mm_madd_epi16(m128Tmp13,
3547 _mm_load_si128((__m128i *) (transform32x32[6][10])));
3548 E7l = _mm_madd_epi16(m128Tmp14,
3549 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3550 E7h = _mm_madd_epi16(m128Tmp15,
3551 _mm_load_si128((__m128i *) (transform32x32[7][10])));
3552
3553 O10l = _mm_add_epi32(E0l, E1l);
3554 O10l = _mm_add_epi32(O10l, E2l);
3555 O10l = _mm_add_epi32(O10l, E3l);
3556 O10l = _mm_add_epi32(O10l, E4l);
3557 O10l = _mm_add_epi32(O10l, E5l);
3558 O10l = _mm_add_epi32(O10l, E6l);
3559 O10l = _mm_add_epi32(O10l, E7l);
3560
3561 O10h = _mm_add_epi32(E0h, E1h);
3562 O10h = _mm_add_epi32(O10h, E2h);
3563 O10h = _mm_add_epi32(O10h, E3h);
3564 O10h = _mm_add_epi32(O10h, E4h);
3565 O10h = _mm_add_epi32(O10h, E5h);
3566 O10h = _mm_add_epi32(O10h, E6h);
3567 O10h = _mm_add_epi32(O10h, E7h);
3568
3569 /* Compute O11 */
3570
3571 E0l = _mm_madd_epi16(m128Tmp0,
3572 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3573 E0h = _mm_madd_epi16(m128Tmp1,
3574 _mm_load_si128((__m128i *) (transform32x32[0][11])));
3575 E1l = _mm_madd_epi16(m128Tmp2,
3576 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3577 E1h = _mm_madd_epi16(m128Tmp3,
3578 _mm_load_si128((__m128i *) (transform32x32[1][11])));
3579 E2l = _mm_madd_epi16(m128Tmp4,
3580 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3581 E2h = _mm_madd_epi16(m128Tmp5,
3582 _mm_load_si128((__m128i *) (transform32x32[2][11])));
3583 E3l = _mm_madd_epi16(m128Tmp6,
3584 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3585 E3h = _mm_madd_epi16(m128Tmp7,
3586 _mm_load_si128((__m128i *) (transform32x32[3][11])));
3587
3588 E4l = _mm_madd_epi16(m128Tmp8,
3589 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3590 E4h = _mm_madd_epi16(m128Tmp9,
3591 _mm_load_si128((__m128i *) (transform32x32[4][11])));
3592 E5l = _mm_madd_epi16(m128Tmp10,
3593 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3594 E5h = _mm_madd_epi16(m128Tmp11,
3595 _mm_load_si128((__m128i *) (transform32x32[5][11])));
3596 E6l = _mm_madd_epi16(m128Tmp12,
3597 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3598 E6h = _mm_madd_epi16(m128Tmp13,
3599 _mm_load_si128((__m128i *) (transform32x32[6][11])));
3600 E7l = _mm_madd_epi16(m128Tmp14,
3601 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3602 E7h = _mm_madd_epi16(m128Tmp15,
3603 _mm_load_si128((__m128i *) (transform32x32[7][11])));
3604
3605 O11l = _mm_add_epi32(E0l, E1l);
3606 O11l = _mm_add_epi32(O11l, E2l);
3607 O11l = _mm_add_epi32(O11l, E3l);
3608 O11l = _mm_add_epi32(O11l, E4l);
3609 O11l = _mm_add_epi32(O11l, E5l);
3610 O11l = _mm_add_epi32(O11l, E6l);
3611 O11l = _mm_add_epi32(O11l, E7l);
3612
3613 O11h = _mm_add_epi32(E0h, E1h);
3614 O11h = _mm_add_epi32(O11h, E2h);
3615 O11h = _mm_add_epi32(O11h, E3h);
3616 O11h = _mm_add_epi32(O11h, E4h);
3617 O11h = _mm_add_epi32(O11h, E5h);
3618 O11h = _mm_add_epi32(O11h, E6h);
3619 O11h = _mm_add_epi32(O11h, E7h);
3620
3621 /* Compute O12 */
3622
3623 E0l = _mm_madd_epi16(m128Tmp0,
3624 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3625 E0h = _mm_madd_epi16(m128Tmp1,
3626 _mm_load_si128((__m128i *) (transform32x32[0][12])));
3627 E1l = _mm_madd_epi16(m128Tmp2,
3628 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3629 E1h = _mm_madd_epi16(m128Tmp3,
3630 _mm_load_si128((__m128i *) (transform32x32[1][12])));
3631 E2l = _mm_madd_epi16(m128Tmp4,
3632 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3633 E2h = _mm_madd_epi16(m128Tmp5,
3634 _mm_load_si128((__m128i *) (transform32x32[2][12])));
3635 E3l = _mm_madd_epi16(m128Tmp6,
3636 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3637 E3h = _mm_madd_epi16(m128Tmp7,
3638 _mm_load_si128((__m128i *) (transform32x32[3][12])));
3639
3640 E4l = _mm_madd_epi16(m128Tmp8,
3641 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3642 E4h = _mm_madd_epi16(m128Tmp9,
3643 _mm_load_si128((__m128i *) (transform32x32[4][12])));
3644 E5l = _mm_madd_epi16(m128Tmp10,
3645 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3646 E5h = _mm_madd_epi16(m128Tmp11,
3647 _mm_load_si128((__m128i *) (transform32x32[5][12])));
3648 E6l = _mm_madd_epi16(m128Tmp12,
3649 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3650 E6h = _mm_madd_epi16(m128Tmp13,
3651 _mm_load_si128((__m128i *) (transform32x32[6][12])));
3652 E7l = _mm_madd_epi16(m128Tmp14,
3653 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3654 E7h = _mm_madd_epi16(m128Tmp15,
3655 _mm_load_si128((__m128i *) (transform32x32[7][12])));
3656
3657 O12l = _mm_add_epi32(E0l, E1l);
3658 O12l = _mm_add_epi32(O12l, E2l);
3659 O12l = _mm_add_epi32(O12l, E3l);
3660 O12l = _mm_add_epi32(O12l, E4l);
3661 O12l = _mm_add_epi32(O12l, E5l);
3662 O12l = _mm_add_epi32(O12l, E6l);
3663 O12l = _mm_add_epi32(O12l, E7l);
3664
3665 O12h = _mm_add_epi32(E0h, E1h);
3666 O12h = _mm_add_epi32(O12h, E2h);
3667 O12h = _mm_add_epi32(O12h, E3h);
3668 O12h = _mm_add_epi32(O12h, E4h);
3669 O12h = _mm_add_epi32(O12h, E5h);
3670 O12h = _mm_add_epi32(O12h, E6h);
3671 O12h = _mm_add_epi32(O12h, E7h);
3672
3673 /* Compute O13 */
3674
3675 E0l = _mm_madd_epi16(m128Tmp0,
3676 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3677 E0h = _mm_madd_epi16(m128Tmp1,
3678 _mm_load_si128((__m128i *) (transform32x32[0][13])));
3679 E1l = _mm_madd_epi16(m128Tmp2,
3680 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3681 E1h = _mm_madd_epi16(m128Tmp3,
3682 _mm_load_si128((__m128i *) (transform32x32[1][13])));
3683 E2l = _mm_madd_epi16(m128Tmp4,
3684 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3685 E2h = _mm_madd_epi16(m128Tmp5,
3686 _mm_load_si128((__m128i *) (transform32x32[2][13])));
3687 E3l = _mm_madd_epi16(m128Tmp6,
3688 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3689 E3h = _mm_madd_epi16(m128Tmp7,
3690 _mm_load_si128((__m128i *) (transform32x32[3][13])));
3691
3692 E4l = _mm_madd_epi16(m128Tmp8,
3693 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3694 E4h = _mm_madd_epi16(m128Tmp9,
3695 _mm_load_si128((__m128i *) (transform32x32[4][13])));
3696 E5l = _mm_madd_epi16(m128Tmp10,
3697 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3698 E5h = _mm_madd_epi16(m128Tmp11,
3699 _mm_load_si128((__m128i *) (transform32x32[5][13])));
3700 E6l = _mm_madd_epi16(m128Tmp12,
3701 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3702 E6h = _mm_madd_epi16(m128Tmp13,
3703 _mm_load_si128((__m128i *) (transform32x32[6][13])));
3704 E7l = _mm_madd_epi16(m128Tmp14,
3705 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3706 E7h = _mm_madd_epi16(m128Tmp15,
3707 _mm_load_si128((__m128i *) (transform32x32[7][13])));
3708
3709 O13l = _mm_add_epi32(E0l, E1l);
3710 O13l = _mm_add_epi32(O13l, E2l);
3711 O13l = _mm_add_epi32(O13l, E3l);
3712 O13l = _mm_add_epi32(O13l, E4l);
3713 O13l = _mm_add_epi32(O13l, E5l);
3714 O13l = _mm_add_epi32(O13l, E6l);
3715 O13l = _mm_add_epi32(O13l, E7l);
3716
3717 O13h = _mm_add_epi32(E0h, E1h);
3718 O13h = _mm_add_epi32(O13h, E2h);
3719 O13h = _mm_add_epi32(O13h, E3h);
3720 O13h = _mm_add_epi32(O13h, E4h);
3721 O13h = _mm_add_epi32(O13h, E5h);
3722 O13h = _mm_add_epi32(O13h, E6h);
3723 O13h = _mm_add_epi32(O13h, E7h);
3724
3725 /* Compute O14 */
3726
3727 E0l = _mm_madd_epi16(m128Tmp0,
3728 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3729 E0h = _mm_madd_epi16(m128Tmp1,
3730 _mm_load_si128((__m128i *) (transform32x32[0][14])));
3731 E1l = _mm_madd_epi16(m128Tmp2,
3732 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3733 E1h = _mm_madd_epi16(m128Tmp3,
3734 _mm_load_si128((__m128i *) (transform32x32[1][14])));
3735 E2l = _mm_madd_epi16(m128Tmp4,
3736 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3737 E2h = _mm_madd_epi16(m128Tmp5,
3738 _mm_load_si128((__m128i *) (transform32x32[2][14])));
3739 E3l = _mm_madd_epi16(m128Tmp6,
3740 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3741 E3h = _mm_madd_epi16(m128Tmp7,
3742 _mm_load_si128((__m128i *) (transform32x32[3][14])));
3743
3744 E4l = _mm_madd_epi16(m128Tmp8,
3745 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3746 E4h = _mm_madd_epi16(m128Tmp9,
3747 _mm_load_si128((__m128i *) (transform32x32[4][14])));
3748 E5l = _mm_madd_epi16(m128Tmp10,
3749 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3750 E5h = _mm_madd_epi16(m128Tmp11,
3751 _mm_load_si128((__m128i *) (transform32x32[5][14])));
3752 E6l = _mm_madd_epi16(m128Tmp12,
3753 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3754 E6h = _mm_madd_epi16(m128Tmp13,
3755 _mm_load_si128((__m128i *) (transform32x32[6][14])));
3756 E7l = _mm_madd_epi16(m128Tmp14,
3757 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3758 E7h = _mm_madd_epi16(m128Tmp15,
3759 _mm_load_si128((__m128i *) (transform32x32[7][14])));
3760
3761 O14l = _mm_add_epi32(E0l, E1l);
3762 O14l = _mm_add_epi32(O14l, E2l);
3763 O14l = _mm_add_epi32(O14l, E3l);
3764 O14l = _mm_add_epi32(O14l, E4l);
3765 O14l = _mm_add_epi32(O14l, E5l);
3766 O14l = _mm_add_epi32(O14l, E6l);
3767 O14l = _mm_add_epi32(O14l, E7l);
3768
3769 O14h = _mm_add_epi32(E0h, E1h);
3770 O14h = _mm_add_epi32(O14h, E2h);
3771 O14h = _mm_add_epi32(O14h, E3h);
3772 O14h = _mm_add_epi32(O14h, E4h);
3773 O14h = _mm_add_epi32(O14h, E5h);
3774 O14h = _mm_add_epi32(O14h, E6h);
3775 O14h = _mm_add_epi32(O14h, E7h);
3776
3777 /* Compute O15*/
3778
3779 E0l = _mm_madd_epi16(m128Tmp0,
3780 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3781 E0h = _mm_madd_epi16(m128Tmp1,
3782 _mm_load_si128((__m128i *) (transform32x32[0][15])));
3783 E1l = _mm_madd_epi16(m128Tmp2,
3784 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3785 E1h = _mm_madd_epi16(m128Tmp3,
3786 _mm_load_si128((__m128i *) (transform32x32[1][15])));
3787 E2l = _mm_madd_epi16(m128Tmp4,
3788 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3789 E2h = _mm_madd_epi16(m128Tmp5,
3790 _mm_load_si128((__m128i *) (transform32x32[2][15])));
3791 E3l = _mm_madd_epi16(m128Tmp6,
3792 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3793 E3h = _mm_madd_epi16(m128Tmp7,
3794 _mm_load_si128((__m128i *) (transform32x32[3][15])));
3795
3796 E4l = _mm_madd_epi16(m128Tmp8,
3797 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3798 E4h = _mm_madd_epi16(m128Tmp9,
3799 _mm_load_si128((__m128i *) (transform32x32[4][15])));
3800 E5l = _mm_madd_epi16(m128Tmp10,
3801 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3802 E5h = _mm_madd_epi16(m128Tmp11,
3803 _mm_load_si128((__m128i *) (transform32x32[5][15])));
3804 E6l = _mm_madd_epi16(m128Tmp12,
3805 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3806 E6h = _mm_madd_epi16(m128Tmp13,
3807 _mm_load_si128((__m128i *) (transform32x32[6][15])));
3808 E7l = _mm_madd_epi16(m128Tmp14,
3809 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3810 E7h = _mm_madd_epi16(m128Tmp15,
3811 _mm_load_si128((__m128i *) (transform32x32[7][15])));
3812
3813 O15l = _mm_add_epi32(E0l, E1l);
3814 O15l = _mm_add_epi32(O15l, E2l);
3815 O15l = _mm_add_epi32(O15l, E3l);
3816 O15l = _mm_add_epi32(O15l, E4l);
3817 O15l = _mm_add_epi32(O15l, E5l);
3818 O15l = _mm_add_epi32(O15l, E6l);
3819 O15l = _mm_add_epi32(O15l, E7l);
3820
3821 O15h = _mm_add_epi32(E0h, E1h);
3822 O15h = _mm_add_epi32(O15h, E2h);
3823 O15h = _mm_add_epi32(O15h, E3h);
3824 O15h = _mm_add_epi32(O15h, E4h);
3825 O15h = _mm_add_epi32(O15h, E5h);
3826 O15h = _mm_add_epi32(O15h, E6h);
3827 O15h = _mm_add_epi32(O15h, E7h);
3828 /* Compute E0 */
3829
3830 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3831 E0l = _mm_madd_epi16(m128Tmp0,
3832 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3833 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3834 E0h = _mm_madd_epi16(m128Tmp1,
3835 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3836
3837 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3838 E0l = _mm_add_epi32(E0l,
3839 _mm_madd_epi16(m128Tmp2,
3840 _mm_load_si128(
3841 (__m128i *) (transform16x16_1[1][0]))));
3842 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3843 E0h = _mm_add_epi32(E0h,
3844 _mm_madd_epi16(m128Tmp3,
3845 _mm_load_si128(
3846 (__m128i *) (transform16x16_1[1][0]))));
3847
3848 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3849 E0l = _mm_add_epi32(E0l,
3850 _mm_madd_epi16(m128Tmp4,
3851 _mm_load_si128(
3852 (__m128i *) (transform16x16_1[2][0]))));
3853 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3854 E0h = _mm_add_epi32(E0h,
3855 _mm_madd_epi16(m128Tmp5,
3856 _mm_load_si128(
3857 (__m128i *) (transform16x16_1[2][0]))));
3858
3859 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3860 E0l = _mm_add_epi32(E0l,
3861 _mm_madd_epi16(m128Tmp6,
3862 _mm_load_si128(
3863 (__m128i *) (transform16x16_1[3][0]))));
3864 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3865 E0h = _mm_add_epi32(E0h,
3866 _mm_madd_epi16(m128Tmp7,
3867 _mm_load_si128(
3868 (__m128i *) (transform16x16_1[3][0]))));
3869
3870 /* Compute E1 */
3871 E1l = _mm_madd_epi16(m128Tmp0,
3872 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3873 E1h = _mm_madd_epi16(m128Tmp1,
3874 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3875 E1l = _mm_add_epi32(E1l,
3876 _mm_madd_epi16(m128Tmp2,
3877 _mm_load_si128(
3878 (__m128i *) (transform16x16_1[1][1]))));
3879 E1h = _mm_add_epi32(E1h,
3880 _mm_madd_epi16(m128Tmp3,
3881 _mm_load_si128(
3882 (__m128i *) (transform16x16_1[1][1]))));
3883 E1l = _mm_add_epi32(E1l,
3884 _mm_madd_epi16(m128Tmp4,
3885 _mm_load_si128(
3886 (__m128i *) (transform16x16_1[2][1]))));
3887 E1h = _mm_add_epi32(E1h,
3888 _mm_madd_epi16(m128Tmp5,
3889 _mm_load_si128(
3890 (__m128i *) (transform16x16_1[2][1]))));
3891 E1l = _mm_add_epi32(E1l,
3892 _mm_madd_epi16(m128Tmp6,
3893 _mm_load_si128(
3894 (__m128i *) (transform16x16_1[3][1]))));
3895 E1h = _mm_add_epi32(E1h,
3896 _mm_madd_epi16(m128Tmp7,
3897 _mm_load_si128(
3898 (__m128i *) (transform16x16_1[3][1]))));
3899
3900 /* Compute E2 */
3901 E2l = _mm_madd_epi16(m128Tmp0,
3902 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3903 E2h = _mm_madd_epi16(m128Tmp1,
3904 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3905 E2l = _mm_add_epi32(E2l,
3906 _mm_madd_epi16(m128Tmp2,
3907 _mm_load_si128(
3908 (__m128i *) (transform16x16_1[1][2]))));
3909 E2h = _mm_add_epi32(E2h,
3910 _mm_madd_epi16(m128Tmp3,
3911 _mm_load_si128(
3912 (__m128i *) (transform16x16_1[1][2]))));
3913 E2l = _mm_add_epi32(E2l,
3914 _mm_madd_epi16(m128Tmp4,
3915 _mm_load_si128(
3916 (__m128i *) (transform16x16_1[2][2]))));
3917 E2h = _mm_add_epi32(E2h,
3918 _mm_madd_epi16(m128Tmp5,
3919 _mm_load_si128(
3920 (__m128i *) (transform16x16_1[2][2]))));
3921 E2l = _mm_add_epi32(E2l,
3922 _mm_madd_epi16(m128Tmp6,
3923 _mm_load_si128(
3924 (__m128i *) (transform16x16_1[3][2]))));
3925 E2h = _mm_add_epi32(E2h,
3926 _mm_madd_epi16(m128Tmp7,
3927 _mm_load_si128(
3928 (__m128i *) (transform16x16_1[3][2]))));
3929
3930 /* Compute E3 */
3931 E3l = _mm_madd_epi16(m128Tmp0,
3932 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3933 E3h = _mm_madd_epi16(m128Tmp1,
3934 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3935 E3l = _mm_add_epi32(E3l,
3936 _mm_madd_epi16(m128Tmp2,
3937 _mm_load_si128(
3938 (__m128i *) (transform16x16_1[1][3]))));
3939 E3h = _mm_add_epi32(E3h,
3940 _mm_madd_epi16(m128Tmp3,
3941 _mm_load_si128(
3942 (__m128i *) (transform16x16_1[1][3]))));
3943 E3l = _mm_add_epi32(E3l,
3944 _mm_madd_epi16(m128Tmp4,
3945 _mm_load_si128(
3946 (__m128i *) (transform16x16_1[2][3]))));
3947 E3h = _mm_add_epi32(E3h,
3948 _mm_madd_epi16(m128Tmp5,
3949 _mm_load_si128(
3950 (__m128i *) (transform16x16_1[2][3]))));
3951 E3l = _mm_add_epi32(E3l,
3952 _mm_madd_epi16(m128Tmp6,
3953 _mm_load_si128(
3954 (__m128i *) (transform16x16_1[3][3]))));
3955 E3h = _mm_add_epi32(E3h,
3956 _mm_madd_epi16(m128Tmp7,
3957 _mm_load_si128(
3958 (__m128i *) (transform16x16_1[3][3]))));
3959
3960 /* Compute E4 */
3961 E4l = _mm_madd_epi16(m128Tmp0,
3962 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3963 E4h = _mm_madd_epi16(m128Tmp1,
3964 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3965 E4l = _mm_add_epi32(E4l,
3966 _mm_madd_epi16(m128Tmp2,
3967 _mm_load_si128(
3968 (__m128i *) (transform16x16_1[1][4]))));
3969 E4h = _mm_add_epi32(E4h,
3970 _mm_madd_epi16(m128Tmp3,
3971 _mm_load_si128(
3972 (__m128i *) (transform16x16_1[1][4]))));
3973 E4l = _mm_add_epi32(E4l,
3974 _mm_madd_epi16(m128Tmp4,
3975 _mm_load_si128(
3976 (__m128i *) (transform16x16_1[2][4]))));
3977 E4h = _mm_add_epi32(E4h,
3978 _mm_madd_epi16(m128Tmp5,
3979 _mm_load_si128(
3980 (__m128i *) (transform16x16_1[2][4]))));
3981 E4l = _mm_add_epi32(E4l,
3982 _mm_madd_epi16(m128Tmp6,
3983 _mm_load_si128(
3984 (__m128i *) (transform16x16_1[3][4]))));
3985 E4h = _mm_add_epi32(E4h,
3986 _mm_madd_epi16(m128Tmp7,
3987 _mm_load_si128(
3988 (__m128i *) (transform16x16_1[3][4]))));
3989
3990 /* Compute E5 */
3991 E5l = _mm_madd_epi16(m128Tmp0,
3992 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3993 E5h = _mm_madd_epi16(m128Tmp1,
3994 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3995 E5l = _mm_add_epi32(E5l,
3996 _mm_madd_epi16(m128Tmp2,
3997 _mm_load_si128(
3998 (__m128i *) (transform16x16_1[1][5]))));
3999 E5h = _mm_add_epi32(E5h,
4000 _mm_madd_epi16(m128Tmp3,
4001 _mm_load_si128(
4002 (__m128i *) (transform16x16_1[1][5]))));
4003 E5l = _mm_add_epi32(E5l,
4004 _mm_madd_epi16(m128Tmp4,
4005 _mm_load_si128(
4006 (__m128i *) (transform16x16_1[2][5]))));
4007 E5h = _mm_add_epi32(E5h,
4008 _mm_madd_epi16(m128Tmp5,
4009 _mm_load_si128(
4010 (__m128i *) (transform16x16_1[2][5]))));
4011 E5l = _mm_add_epi32(E5l,
4012 _mm_madd_epi16(m128Tmp6,
4013 _mm_load_si128(
4014 (__m128i *) (transform16x16_1[3][5]))));
4015 E5h = _mm_add_epi32(E5h,
4016 _mm_madd_epi16(m128Tmp7,
4017 _mm_load_si128(
4018 (__m128i *) (transform16x16_1[3][5]))));
4019
4020 /* Compute E6 */
4021 E6l = _mm_madd_epi16(m128Tmp0,
4022 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4023 E6h = _mm_madd_epi16(m128Tmp1,
4024 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4025 E6l = _mm_add_epi32(E6l,
4026 _mm_madd_epi16(m128Tmp2,
4027 _mm_load_si128(
4028 (__m128i *) (transform16x16_1[1][6]))));
4029 E6h = _mm_add_epi32(E6h,
4030 _mm_madd_epi16(m128Tmp3,
4031 _mm_load_si128(
4032 (__m128i *) (transform16x16_1[1][6]))));
4033 E6l = _mm_add_epi32(E6l,
4034 _mm_madd_epi16(m128Tmp4,
4035 _mm_load_si128(
4036 (__m128i *) (transform16x16_1[2][6]))));
4037 E6h = _mm_add_epi32(E6h,
4038 _mm_madd_epi16(m128Tmp5,
4039 _mm_load_si128(
4040 (__m128i *) (transform16x16_1[2][6]))));
4041 E6l = _mm_add_epi32(E6l,
4042 _mm_madd_epi16(m128Tmp6,
4043 _mm_load_si128(
4044 (__m128i *) (transform16x16_1[3][6]))));
4045 E6h = _mm_add_epi32(E6h,
4046 _mm_madd_epi16(m128Tmp7,
4047 _mm_load_si128(
4048 (__m128i *) (transform16x16_1[3][6]))));
4049
4050 /* Compute E7 */
4051 E7l = _mm_madd_epi16(m128Tmp0,
4052 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4053 E7h = _mm_madd_epi16(m128Tmp1,
4054 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4055 E7l = _mm_add_epi32(E7l,
4056 _mm_madd_epi16(m128Tmp2,
4057 _mm_load_si128(
4058 (__m128i *) (transform16x16_1[1][7]))));
4059 E7h = _mm_add_epi32(E7h,
4060 _mm_madd_epi16(m128Tmp3,
4061 _mm_load_si128(
4062 (__m128i *) (transform16x16_1[1][7]))));
4063 E7l = _mm_add_epi32(E7l,
4064 _mm_madd_epi16(m128Tmp4,
4065 _mm_load_si128(
4066 (__m128i *) (transform16x16_1[2][7]))));
4067 E7h = _mm_add_epi32(E7h,
4068 _mm_madd_epi16(m128Tmp5,
4069 _mm_load_si128(
4070 (__m128i *) (transform16x16_1[2][7]))));
4071 E7l = _mm_add_epi32(E7l,
4072 _mm_madd_epi16(m128Tmp6,
4073 _mm_load_si128(
4074 (__m128i *) (transform16x16_1[3][7]))));
4075 E7h = _mm_add_epi32(E7h,
4076 _mm_madd_epi16(m128Tmp7,
4077 _mm_load_si128(
4078 (__m128i *) (transform16x16_1[3][7]))));
4079
4080 /* Compute E00-E03 (even-part terms from rows 4, 12, 20, 28) */
4081
4082 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4083 E00l = _mm_madd_epi16(m128Tmp0,
4084 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4085 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4086 E00h = _mm_madd_epi16(m128Tmp1,
4087 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4088
4089 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4090 E00l = _mm_add_epi32(E00l,
4091 _mm_madd_epi16(m128Tmp2,
4092 _mm_load_si128(
4093 (__m128i *) (transform16x16_2[1][0]))));
4094 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4095 E00h = _mm_add_epi32(E00h,
4096 _mm_madd_epi16(m128Tmp3,
4097 _mm_load_si128(
4098 (__m128i *) (transform16x16_2[1][0]))));
4099
4100 E01l = _mm_madd_epi16(m128Tmp0,
4101 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4102 E01h = _mm_madd_epi16(m128Tmp1,
4103 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4104 E01l = _mm_add_epi32(E01l,
4105 _mm_madd_epi16(m128Tmp2,
4106 _mm_load_si128(
4107 (__m128i *) (transform16x16_2[1][1]))));
4108 E01h = _mm_add_epi32(E01h,
4109 _mm_madd_epi16(m128Tmp3,
4110 _mm_load_si128(
4111 (__m128i *) (transform16x16_2[1][1]))));
4112
4113 E02l = _mm_madd_epi16(m128Tmp0,
4114 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4115 E02h = _mm_madd_epi16(m128Tmp1,
4116 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4117 E02l = _mm_add_epi32(E02l,
4118 _mm_madd_epi16(m128Tmp2,
4119 _mm_load_si128(
4120 (__m128i *) (transform16x16_2[1][2]))));
4121 E02h = _mm_add_epi32(E02h,
4122 _mm_madd_epi16(m128Tmp3,
4123 _mm_load_si128(
4124 (__m128i *) (transform16x16_2[1][2]))));
4125
4126 E03l = _mm_madd_epi16(m128Tmp0,
4127 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4128 E03h = _mm_madd_epi16(m128Tmp1,
4129 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4130 E03l = _mm_add_epi32(E03l,
4131 _mm_madd_epi16(m128Tmp2,
4132 _mm_load_si128(
4133 (__m128i *) (transform16x16_2[1][3]))));
4134 E03h = _mm_add_epi32(E03h,
4135 _mm_madd_epi16(m128Tmp3,
4136 _mm_load_si128(
4137 (__m128i *) (transform16x16_2[1][3]))));
4138
4139 /* Compute EE0 and EEE */
4140
4141 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4142 EE0l = _mm_madd_epi16(m128Tmp0,
4143 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4144 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4145 EE0h = _mm_madd_epi16(m128Tmp1,
4146 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4147
4148 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4149 EEE0l = _mm_madd_epi16(m128Tmp2,
4150 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4151 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4152 EEE0h = _mm_madd_epi16(m128Tmp3,
4153 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4154
4155 EE1l = _mm_madd_epi16(m128Tmp0,
4156 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4157 EE1h = _mm_madd_epi16(m128Tmp1,
4158 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4159
4160 EEE1l = _mm_madd_epi16(m128Tmp2,
4161 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4162 EEE1h = _mm_madd_epi16(m128Tmp3,
4163 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4164
4165 /* Compute EE */
4166
4167 EE2l = _mm_sub_epi32(EEE1l, EE1l);
4168 EE3l = _mm_sub_epi32(EEE0l, EE0l);
4169 EE2h = _mm_sub_epi32(EEE1h, EE1h);
4170 EE3h = _mm_sub_epi32(EEE0h, EE0h);
4171
4172 EE0l = _mm_add_epi32(EEE0l, EE0l);
4173 EE1l = _mm_add_epi32(EEE1l, EE1l);
4174 EE0h = _mm_add_epi32(EEE0h, EE0h);
4175 EE1h = _mm_add_epi32(EEE1h, EE1h);
4176 /**/
4177
4178 EE7l = _mm_sub_epi32(EE0l, E00l);
4179 EE6l = _mm_sub_epi32(EE1l, E01l);
4180 EE5l = _mm_sub_epi32(EE2l, E02l);
4181 EE4l = _mm_sub_epi32(EE3l, E03l);
4182
4183 EE7h = _mm_sub_epi32(EE0h, E00h);
4184 EE6h = _mm_sub_epi32(EE1h, E01h);
4185 EE5h = _mm_sub_epi32(EE2h, E02h);
4186 EE4h = _mm_sub_epi32(EE3h, E03h);
4187
4188 EE0l = _mm_add_epi32(EE0l, E00l);
4189 EE1l = _mm_add_epi32(EE1l, E01l);
4190 EE2l = _mm_add_epi32(EE2l, E02l);
4191 EE3l = _mm_add_epi32(EE3l, E03l);
4192
4193 EE0h = _mm_add_epi32(EE0h, E00h);
4194 EE1h = _mm_add_epi32(EE1h, E01h);
4195 EE2h = _mm_add_epi32(EE2h, E02h);
4196 EE3h = _mm_add_epi32(EE3h, E03h);
4197 /* Compute E */
4198
4199 E15l = _mm_sub_epi32(EE0l, E0l);
4200 E15l = _mm_add_epi32(E15l, m128iAdd);
4201 E14l = _mm_sub_epi32(EE1l, E1l);
4202 E14l = _mm_add_epi32(E14l, m128iAdd);
4203 E13l = _mm_sub_epi32(EE2l, E2l);
4204 E13l = _mm_add_epi32(E13l, m128iAdd);
4205 E12l = _mm_sub_epi32(EE3l, E3l);
4206 E12l = _mm_add_epi32(E12l, m128iAdd);
4207 E11l = _mm_sub_epi32(EE4l, E4l);
4208 E11l = _mm_add_epi32(E11l, m128iAdd);
4209 E10l = _mm_sub_epi32(EE5l, E5l);
4210 E10l = _mm_add_epi32(E10l, m128iAdd);
4211 E9l = _mm_sub_epi32(EE6l, E6l);
4212 E9l = _mm_add_epi32(E9l, m128iAdd);
4213 E8l = _mm_sub_epi32(EE7l, E7l);
4214 E8l = _mm_add_epi32(E8l, m128iAdd);
4215
4216 E0l = _mm_add_epi32(EE0l, E0l);
4217 E0l = _mm_add_epi32(E0l, m128iAdd);
4218 E1l = _mm_add_epi32(EE1l, E1l);
4219 E1l = _mm_add_epi32(E1l, m128iAdd);
4220 E2l = _mm_add_epi32(EE2l, E2l);
4221 E2l = _mm_add_epi32(E2l, m128iAdd);
4222 E3l = _mm_add_epi32(EE3l, E3l);
4223 E3l = _mm_add_epi32(E3l, m128iAdd);
4224 E4l = _mm_add_epi32(EE4l, E4l);
4225 E4l = _mm_add_epi32(E4l, m128iAdd);
4226 E5l = _mm_add_epi32(EE5l, E5l);
4227 E5l = _mm_add_epi32(E5l, m128iAdd);
4228 E6l = _mm_add_epi32(EE6l, E6l);
4229 E6l = _mm_add_epi32(E6l, m128iAdd);
4230 E7l = _mm_add_epi32(EE7l, E7l);
4231 E7l = _mm_add_epi32(E7l, m128iAdd);
4232
4233 E15h = _mm_sub_epi32(EE0h, E0h);
4234 E15h = _mm_add_epi32(E15h, m128iAdd);
4235 E14h = _mm_sub_epi32(EE1h, E1h);
4236 E14h = _mm_add_epi32(E14h, m128iAdd);
4237 E13h = _mm_sub_epi32(EE2h, E2h);
4238 E13h = _mm_add_epi32(E13h, m128iAdd);
4239 E12h = _mm_sub_epi32(EE3h, E3h);
4240 E12h = _mm_add_epi32(E12h, m128iAdd);
4241 E11h = _mm_sub_epi32(EE4h, E4h);
4242 E11h = _mm_add_epi32(E11h, m128iAdd);
4243 E10h = _mm_sub_epi32(EE5h, E5h);
4244 E10h = _mm_add_epi32(E10h, m128iAdd);
4245 E9h = _mm_sub_epi32(EE6h, E6h);
4246 E9h = _mm_add_epi32(E9h, m128iAdd);
4247 E8h = _mm_sub_epi32(EE7h, E7h);
4248 E8h = _mm_add_epi32(E8h, m128iAdd);
4249
4250 E0h = _mm_add_epi32(EE0h, E0h);
4251 E0h = _mm_add_epi32(E0h, m128iAdd);
4252 E1h = _mm_add_epi32(EE1h, E1h);
4253 E1h = _mm_add_epi32(E1h, m128iAdd);
4254 E2h = _mm_add_epi32(EE2h, E2h);
4255 E2h = _mm_add_epi32(E2h, m128iAdd);
4256 E3h = _mm_add_epi32(EE3h, E3h);
4257 E3h = _mm_add_epi32(E3h, m128iAdd);
4258 E4h = _mm_add_epi32(EE4h, E4h);
4259 E4h = _mm_add_epi32(E4h, m128iAdd);
4260 E5h = _mm_add_epi32(EE5h, E5h);
4261 E5h = _mm_add_epi32(E5h, m128iAdd);
4262 E6h = _mm_add_epi32(EE6h, E6h);
4263 E6h = _mm_add_epi32(E6h, m128iAdd);
4264 E7h = _mm_add_epi32(EE7h, E7h);
4265 E7h = _mm_add_epi32(E7h, m128iAdd);
4266
4267 m128iS0 = _mm_packs_epi32(
4268 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4269 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4270 m128iS1 = _mm_packs_epi32(
4271 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4272 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4273 m128iS2 = _mm_packs_epi32(
4274 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4275 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4276 m128iS3 = _mm_packs_epi32(
4277 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4278 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4279 m128iS4 = _mm_packs_epi32(
4280 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4281 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4282 m128iS5 = _mm_packs_epi32(
4283 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4284 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4285 m128iS6 = _mm_packs_epi32(
4286 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4287 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4288 m128iS7 = _mm_packs_epi32(
4289 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4290 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4291 m128iS8 = _mm_packs_epi32(
4292 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4293 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4294 m128iS9 = _mm_packs_epi32(
4295 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4296 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4297 m128iS10 = _mm_packs_epi32(
4298 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4299 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4300 m128iS11 = _mm_packs_epi32(
4301 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4302 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4303 m128iS12 = _mm_packs_epi32(
4304 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4305 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4306 m128iS13 = _mm_packs_epi32(
4307 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4308 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4309 m128iS14 = _mm_packs_epi32(
4310 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4311 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4312 m128iS15 = _mm_packs_epi32(
4313 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4314 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4315
4316 m128iS31 = _mm_packs_epi32(
4317 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4318 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4319 m128iS30 = _mm_packs_epi32(
4320 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4321 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4322 m128iS29 = _mm_packs_epi32(
4323 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4324 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4325 m128iS28 = _mm_packs_epi32(
4326 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4327 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4328 m128iS27 = _mm_packs_epi32(
4329 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4330 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4331 m128iS26 = _mm_packs_epi32(
4332 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4333 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4334 m128iS25 = _mm_packs_epi32(
4335 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4336 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4337 m128iS24 = _mm_packs_epi32(
4338 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4339 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4340 m128iS23 = _mm_packs_epi32(
4341 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4342 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4343 m128iS22 = _mm_packs_epi32(
4344 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4345 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4346 m128iS21 = _mm_packs_epi32(
4347 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4348 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4349 m128iS20 = _mm_packs_epi32(
4350 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4351 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4352 m128iS19 = _mm_packs_epi32(
4353 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4354 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4355 m128iS18 = _mm_packs_epi32(
4356 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4357 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4358 m128iS17 = _mm_packs_epi32(
4359 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4360 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4361 m128iS16 = _mm_packs_epi32(
4362 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4363 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4364
4365 if (!j) {
4366 /* Inverse the matrix */
4367 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4368 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4369 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4370 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4371 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4372 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4373 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4374 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4375 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4376 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4377 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4378 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4379 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4380 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4381 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4382 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4383
4384 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4385 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4386 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4387 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4388 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4389 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4390 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4391 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4392 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4393 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4394 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4395 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4396 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4397 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4398 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4399 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4400
4401 E0h = _mm_unpacklo_epi16(E0l, E8l);
4402 E1h = _mm_unpacklo_epi16(E1l, E9l);
4403 E2h = _mm_unpacklo_epi16(E2l, E10l);
4404 E3h = _mm_unpacklo_epi16(E3l, E11l);
4405 E4h = _mm_unpacklo_epi16(E4l, E12l);
4406 E5h = _mm_unpacklo_epi16(E5l, E13l);
4407 E6h = _mm_unpacklo_epi16(E6l, E14l);
4408 E7h = _mm_unpacklo_epi16(E7l, E15l);
4409
4410 E8h = _mm_unpackhi_epi16(E0l, E8l);
4411 E9h = _mm_unpackhi_epi16(E1l, E9l);
4412 E10h = _mm_unpackhi_epi16(E2l, E10l);
4413 E11h = _mm_unpackhi_epi16(E3l, E11l);
4414 E12h = _mm_unpackhi_epi16(E4l, E12l);
4415 E13h = _mm_unpackhi_epi16(E5l, E13l);
4416 E14h = _mm_unpackhi_epi16(E6l, E14l);
4417 E15h = _mm_unpackhi_epi16(E7l, E15l);
4418
4419 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4420 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4421 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4422 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4423
4424 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4425 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4426 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4427 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4428
4429 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4430 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4431 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4432 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4433
4434 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4435 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4436 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4437 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4438
4439 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4440 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4441 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4442 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4443
4444 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4445 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4446 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4447 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4448
4449 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4450 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4451 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4452 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4453
4454 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4455 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4456 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4457 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4458
4459 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4460 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4461 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4462 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4463
4464 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4465 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4466 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4467 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4468
4469 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4470 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4471 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4472 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4473
4474 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4475 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4476 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4477 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4478
4479 /* */
4480 E0h = _mm_unpacklo_epi16(O0l, O8l);
4481 E1h = _mm_unpacklo_epi16(O1l, O9l);
4482 E2h = _mm_unpacklo_epi16(O2l, O10l);
4483 E3h = _mm_unpacklo_epi16(O3l, O11l);
4484 E4h = _mm_unpacklo_epi16(O4l, O12l);
4485 E5h = _mm_unpacklo_epi16(O5l, O13l);
4486 E6h = _mm_unpacklo_epi16(O6l, O14l);
4487 E7h = _mm_unpacklo_epi16(O7l, O15l);
4488
4489 E8h = _mm_unpackhi_epi16(O0l, O8l);
4490 E9h = _mm_unpackhi_epi16(O1l, O9l);
4491 E10h = _mm_unpackhi_epi16(O2l, O10l);
4492 E11h = _mm_unpackhi_epi16(O3l, O11l);
4493 E12h = _mm_unpackhi_epi16(O4l, O12l);
4494 E13h = _mm_unpackhi_epi16(O5l, O13l);
4495 E14h = _mm_unpackhi_epi16(O6l, O14l);
4496 E15h = _mm_unpackhi_epi16(O7l, O15l);
4497
4498 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4499 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4500 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4501 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4502
4503 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4504 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4505 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4506 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4507
4508 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4509 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4510 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4511 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4512
4513 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4514 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4515 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4516 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4517
4518 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4519 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4520 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4521 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4522
4523 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4524 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4525 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4526 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4527
4528 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
4529 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
4530 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
4531 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
4532
4533 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4534 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4535 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4536 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4537
4538 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4539 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4540 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4541 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4542
4543 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
4544 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
4545 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
4546 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
4547
4548 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4549 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4550 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4551 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4552
4553 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4554 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4555 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4556 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4557
4558 if(i==0){
4559 int k = 8;
4560 r0=m128iS0;
4561 r1=m128iS1;
4562 r2=m128iS2;
4563 r3=m128iS3;
4564 r4=m128iS4;
4565 r5=m128iS5;
4566 r6=m128iS6;
4567 r7=m128iS7;
4568 r8=m128iS8;
4569 r9=m128iS9;
4570 r10=m128iS10;
4571 r11=m128iS11;
4572 r12=m128iS12;
4573 r13=m128iS13;
4574 r14=m128iS14;
4575 r15=m128iS15;
4576 r16=m128iS16;
4577 r17=m128iS17;
4578 r18=m128iS18;
4579 r19=m128iS19;
4580 r20=m128iS20;
4581 r21=m128iS21;
4582 r22=m128iS22;
4583 r23=m128iS23;
4584 r24=m128iS24;
4585 r25=m128iS25;
4586 r26=m128iS26;
4587 r27=m128iS27;
4588 r28=m128iS28;
4589 r29=m128iS29;
4590 r30=m128iS30;
4591 r31=m128iS31;
4592 m128iS0 = _mm_load_si128((__m128i *) (src + k));
4593 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
4594 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
4595 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
4596 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
4597 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
4598 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
4599 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
4600 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
4601 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
4602 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
4603 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
4604 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
4605 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
4606 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
4607 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
4608
4609 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
4610 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
4611 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
4612 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
4613 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
4614 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
4615 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
4616 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
4617 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
4618 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
4619 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
4620 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
4621 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
4622 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
4623 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
4624 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
4625
4626 }else if(i ==8){
4627
4628 r32=m128iS0;
4629 r33=m128iS1;
4630 r34=m128iS2;
4631 r35=m128iS3;
4632 r36=m128iS4;
4633 r37=m128iS5;
4634 r38=m128iS6;
4635 r39=m128iS7;
4636 r40=m128iS8;
4637 r41=m128iS9;
4638 r42=m128iS10;
4639 r43=m128iS11;
4640 r44=m128iS12;
4641 r45=m128iS13;
4642 r46=m128iS14;
4643 r47=m128iS15;
4644 r48=m128iS16;
4645 r49=m128iS17;
4646 r50=m128iS18;
4647 r51=m128iS19;
4648 r52=m128iS20;
4649 r53=m128iS21;
4650 r54=m128iS22;
4651 r55=m128iS23;
4652 r56=m128iS24;
4653 r57=m128iS25;
4654 r58=m128iS26;
4655 r59=m128iS27;
4656 r60=m128iS28;
4657 r61=m128iS29;
4658 r62=m128iS30;
4659 r63=m128iS31;
4660
4661 m128iS0 = _mm_load_si128((__m128i *) (src + 16));
4662 m128iS1 = _mm_load_si128((__m128i *) (src + 48));
4663 m128iS2 = _mm_load_si128((__m128i *) (src + 80));
4664 m128iS3 = _mm_load_si128((__m128i *) (src + 112));
4665 m128iS4 = _mm_load_si128((__m128i *) (src + 144));
4666 m128iS5 = _mm_load_si128((__m128i *) (src + 176));
4667 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
4668 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
4669 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
4670 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
4671 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
4672 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
4673 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
4674 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
4675 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
4676 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));
4677
4678 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
4679 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
4680 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
4681 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
4682 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
4683 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
4684 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
4685 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
4686 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
4687 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
4688 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
4689 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
4690 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
4691 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
4692 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
4693 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));
4694
4695
4696 }else if(i ==16){
4697
4698 r64=m128iS0;
4699 r65=m128iS1;
4700 r66=m128iS2;
4701 r67=m128iS3;
4702 r68=m128iS4;
4703 r69=m128iS5;
4704 r70=m128iS6;
4705 r71=m128iS7;
4706 r72=m128iS8;
4707 r73=m128iS9;
4708 r74=m128iS10;
4709 r75=m128iS11;
4710 r76=m128iS12;
4711 r77=m128iS13;
4712 r78=m128iS14;
4713 r79=m128iS15;
4714 r80=m128iS16;
4715 r81=m128iS17;
4716 r82=m128iS18;
4717 r83=m128iS19;
4718 r84=m128iS20;
4719 r85=m128iS21;
4720 r86=m128iS22;
4721 r87=m128iS23;
4722 r88=m128iS24;
4723 r89=m128iS25;
4724 r90=m128iS26;
4725 r91=m128iS27;
4726 r92=m128iS28;
4727 r93=m128iS29;
4728 r94=m128iS30;
4729 r95=m128iS31;
4730
4731 m128iS0 = _mm_load_si128((__m128i *) (src + 24));
4732 m128iS1 = _mm_load_si128((__m128i *) (src + 56));
4733 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
4734 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
4735 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
4736 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
4737 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
4738 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
4739 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
4740 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
4741 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
4742 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
4743 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
4744 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
4745 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
4746 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));
4747
4748 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
4749 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
4750 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
4751 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
4752 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
4753 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
4754 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
4755 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
4756 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
4757 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
4758 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
4759 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
4760 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
4761 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
4762 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
4763 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));
4764
4765 }else{
4766 r96=m128iS0;
4767 r97=m128iS1;
4768 r98=m128iS2;
4769 r99=m128iS3;
4770 r100=m128iS4;
4771 r101=m128iS5;
4772 r102=m128iS6;
4773 r103=m128iS7;
4774 r104=m128iS8;
4775 r105=m128iS9;
4776 r106=m128iS10;
4777 r107=m128iS11;
4778 r108=m128iS12;
4779 r109=m128iS13;
4780 r110=m128iS14;
4781 r111=m128iS15;
4782 r112=m128iS16;
4783 r113=m128iS17;
4784 r114=m128iS18;
4785 r115=m128iS19;
4786 r116=m128iS20;
4787 r117=m128iS21;
4788 r118=m128iS22;
4789 r119=m128iS23;
4790 r120=m128iS24;
4791 r121=m128iS25;
4792 r122=m128iS26;
4793 r123=m128iS27;
4794 r124=m128iS28;
4795 r125=m128iS29;
4796 r126=m128iS30;
4797 r127=m128iS31;
4798
4799 //load data for next j :
4800 m128iS0 = r0;
4801 m128iS1 = r4;
4802 m128iS2 = r8;
4803 m128iS3 = r12;
4804 m128iS4 = r16;
4805 m128iS5 = r20;
4806 m128iS6 = r24;
4807 m128iS7 = r28;
4808 m128iS8 = r32;
4809 m128iS9 = r36;
4810 m128iS10 = r40;
4811 m128iS11 = r44;
4812 m128iS12 = r48;
4813 m128iS13 = r52;
4814 m128iS14 = r56;
4815 m128iS15 = r60;
4816 m128iS16 = r64;
4817 m128iS17 = r68;
4818 m128iS18 = r72;
4819 m128iS19 = r76;
4820 m128iS20 = r80;
4821 m128iS21 = r84;
4822 m128iS22 = r88;
4823 m128iS23 = r92;
4824 m128iS24 = r96;
4825 m128iS25 = r100;
4826 m128iS26 = r104;
4827 m128iS27 = r108;
4828 m128iS28 = r112;
4829 m128iS29 = r116;
4830 m128iS30 = r120;
4831 m128iS31 =r124;
4832 shift = shift_2nd;
4833 m128iAdd = _mm_set1_epi32(add_2nd);
4834
4835
4836 }
4837
4838 } else {
4839
4840 //Transpose Matrix
4841
4842 E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
4843 E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
4844 E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
4845 E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
4846 E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
4847 E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
4848 E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
4849 E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
4850 E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
4851 E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
4852 E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
4853 E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
4854 E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
4855 E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
4856 E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
4857 E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);
4858
4859
4860 E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
4861 E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
4862 E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
4863 E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
4864 E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
4865 E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
4866 E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
4867 E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
4868 E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
4869 E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
4870 E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
4871 E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
4872 E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
4873 E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
4874 E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
4875 E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);
4876
4877 m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
4878 m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
4879 m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
4880 m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
4881 m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
4882 m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
4883 m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
4884 m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);
4885
4886 m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
4887 m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row
4888
4889
4890 m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
4891 m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row
4892
4893 //second row
4894
4895 m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4896 m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4897
4898 m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4899 m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4900
4901 //third row
4902
4903 m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
4904 m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
4905 m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
4906 m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
4907 m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
4908 m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
4909 m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
4910 m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);
4911
4912
4913 m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4914 m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4915
4916 m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4917 m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4918
4919 //fourth row
4920
4921 m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4922 m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4923
4924 m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4925 m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4926
4927 //fith row
4928
4929 m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
4930 m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
4931 m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
4932 m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
4933 m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
4934 m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
4935 m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
4936 m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);
4937
4938 m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4939 m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4940
4941
4942 m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4943 m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);
4944
4945 //sixth row
4946
4947 m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4948 m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4949
4950
4951 m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4952 m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4953
4954 //seventh row
4955
4956 m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
4957 m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
4958 m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
4959 m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
4960 m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
4961 m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
4962 m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
4963 m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);
4964
4965
4966 m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
4967 m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter
4968
4969
4970 m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
4971 m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter
4972
4973 //last row
4974
4975
4976 m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
4977 m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter
4978
4979 m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
4980 m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter
4981
4982
4983 m128Tmp0=_mm_setzero_si128();
4984
4985
4986 //store
4987 dst = (uint8_t*) _dst + i*stride;
4988
4989
4990 E0l= _mm_load_si128((__m128i*)dst); //16 values
4991 E1l= _mm_load_si128((__m128i*)(dst+16));
4992 E2l= _mm_load_si128((__m128i*)(dst+stride));
4993 E3l= _mm_load_si128((__m128i*)(dst+stride+16));
4994 E4l= _mm_load_si128((__m128i*)(dst+2*stride));
4995 E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
4996 E6l= _mm_load_si128((__m128i*)(dst+3*stride));
4997 E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
4998 E8l= _mm_load_si128((__m128i*)(dst+4*stride));
4999 E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
5000 E10l= _mm_load_si128((__m128i*)(dst+5*stride));
5001 E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
5002 E12l= _mm_load_si128((__m128i*)(dst+6*stride));
5003 E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
5004 E14l= _mm_load_si128((__m128i*)(dst+7*stride));
5005 E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));
5006
5007 m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
5008 m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
5009 m128iS0= _mm_packus_epi16(m128iS0,m128iS1);
5010
5011 m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
5012 m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
5013 m128iS2= _mm_packus_epi16(m128iS2,m128iS3);
5014
5015 m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
5016 m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
5017 m128iS4= _mm_packus_epi16(m128iS4,m128iS5);
5018
5019 m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
5020 m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
5021 m128iS6= _mm_packus_epi16(m128iS6,m128iS7);
5022
5023 m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
5024 m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
5025 m128iS8= _mm_packus_epi16(m128iS8,m128iS9);
5026
5027 m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
5028 m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
5029 m128iS10= _mm_packus_epi16(m128iS10,m128iS11);
5030
5031 m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
5032 m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
5033 m128iS12= _mm_packus_epi16(m128iS12,m128iS13);
5034
5035 m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
5036 m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
5037 m128iS14= _mm_packus_epi16(m128iS14,m128iS15);
5038
5039 m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
5040 m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
5041 m128iS16= _mm_packus_epi16(m128iS16,m128iS17);
5042
5043 m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
5044 m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
5045 m128iS18= _mm_packus_epi16(m128iS18,m128iS19);
5046
5047 m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
5048 m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
5049 m128iS20= _mm_packus_epi16(m128iS20,m128iS21);
5050
5051 m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
5052 m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
5053 m128iS22= _mm_packus_epi16(m128iS22,m128iS23);
5054
5055 m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
5056 m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
5057 m128iS24= _mm_packus_epi16(m128iS24,m128iS25);
5058
5059 m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
5060 m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
5061 m128iS26= _mm_packus_epi16(m128iS26,m128iS27);
5062
5063 m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
5064 m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
5065 m128iS28= _mm_packus_epi16(m128iS28,m128iS29);
5066
5067 m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
5068 m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
5069 m128iS30= _mm_packus_epi16(m128iS30,m128iS31);
5070
5071
5072 _mm_store_si128((__m128i*)dst,m128iS0);
5073 _mm_store_si128((__m128i*)(dst+16),m128iS2);
5074 _mm_store_si128((__m128i*)(dst+stride),m128iS4);
5075 _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
5076 _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
5077 _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
5078 _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
5079 _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
5080 _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
5081 _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
5082 _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
5083 _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
5084 _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
5085 _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
5086 _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
5087 _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);
5088
5089
5090 if(i==0){
5091 //load next values :
5092 m128iS0 = r1;
5093 m128iS1 = r5;
5094 m128iS2 = r9;
5095 m128iS3 = r13;
5096 m128iS4 = r17;
5097 m128iS5 = r21;
5098 m128iS6 = r25;
5099 m128iS7 = r29;
5100 m128iS8 = r33;
5101 m128iS9 = r37;
5102 m128iS10 = r41;
5103 m128iS11 = r45;
5104 m128iS12 = r49;
5105 m128iS13 = r53;
5106 m128iS14 = r57;
5107 m128iS15 = r61;
5108 m128iS16 = r65;
5109 m128iS17 = r69;
5110 m128iS18 = r73;
5111 m128iS19 = r77;
5112 m128iS20 = r81;
5113 m128iS21 = r85;
5114 m128iS22 = r89;
5115 m128iS23 = r93;
5116 m128iS24 = r97;
5117 m128iS25 = r101;
5118 m128iS26 = r105;
5119 m128iS27 = r109;
5120 m128iS28 = r113;
5121 m128iS29 = r117;
5122 m128iS30 = r121;
5123 m128iS31 =r125;
5124
5125 }else if(i ==8){
5126 //load next values :
5127 m128iS0 = r2;
5128 m128iS1 = r6;
5129 m128iS2 = r10;
5130 m128iS3 = r14;
5131 m128iS4 = r18;
5132 m128iS5 = r22;
5133 m128iS6 = r26;
5134 m128iS7 = r30;
5135 m128iS8 = r34;
5136 m128iS9 = r38;
5137 m128iS10 = r42;
5138 m128iS11 = r46;
5139 m128iS12 = r50;
5140 m128iS13 = r54;
5141 m128iS14 = r58;
5142 m128iS15 = r62;
5143 m128iS16 = r66;
5144 m128iS17 = r70;
5145 m128iS18 = r74;
5146 m128iS19 = r78;
5147 m128iS20 = r82;
5148 m128iS21 = r86;
5149 m128iS22 = r90;
5150 m128iS23 = r94;
5151 m128iS24 = r98;
5152 m128iS25 = r102;
5153 m128iS26 = r106;
5154 m128iS27 = r110;
5155 m128iS28 = r114;
5156 m128iS29 = r118;
5157 m128iS30 = r122;
5158 m128iS31 =r126;
5159
5160 }else if(i==16)
5161 {
5162 //load next values :
5163 m128iS0 = r3;
5164 m128iS1 = r7;
5165 m128iS2 = r11;
5166 m128iS3 = r15;
5167 m128iS4 = r19;
5168 m128iS5 = r23;
5169 m128iS6 = r27;
5170 m128iS7 = r31;
5171 m128iS8 = r35;
5172 m128iS9 = r39;
5173 m128iS10 = r43;
5174 m128iS11 = r47;
5175 m128iS12 = r51;
5176 m128iS13 = r55;
5177 m128iS14 = r59;
5178 m128iS15 = r63;
5179 m128iS16 = r67;
5180 m128iS17 = r71;
5181 m128iS18 = r75;
5182 m128iS19 = r79;
5183 m128iS20 = r83;
5184 m128iS21 = r87;
5185 m128iS22 = r91;
5186 m128iS23 = r95;
5187 m128iS24 = r99;
5188 m128iS25 = r103;
5189 m128iS26 = r107;
5190 m128iS27 = r111;
5191 m128iS28 = r115;
5192 m128iS29 = r119;
5193 m128iS30 = r123;
5194 m128iS31 =r127;
5195 }
5196 }
5197 }
5198 }
5199 }
5200 #endif
5201
5202
5203 #if 0
5204 void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
5205 ptrdiff_t _stride) {
5206 int i, j;
5207 uint16_t *dst = (uint16_t*) _dst;
5208 ptrdiff_t stride = _stride / 2;
5209 int shift;
5210 uint8_t shift_2nd = 10; //20 - bit depth
5211 uint16_t add_2nd = 1<<9; //shift2 - 1
5212 int16_t *src = coeffs;
5213
5214 __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5215 m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5216 m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5217 m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5218 E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5219 O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5220 E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5221 __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5222 __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5223 EEE0l, EEE1l, EEE0h, EEE1h;
5224 __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5225 m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5226 m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5227 m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5228 O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5229 O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5230 EE4l, EE7h, EE6h, EE5h, EE4h;
5231 m128iS0 = _mm_load_si128((__m128i *) (src));
5232 m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5233 m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5234 m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5235 m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5236 m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5237 m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5238 m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5239 m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5240 m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5241 m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5242 m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5243 m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5244 m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5245 m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5246 m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5247 m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5248 m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5249 m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5250 m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5251 m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5252 m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5253 m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5254 m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5255 m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5256 m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5257 m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5258 m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5259 m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5260 m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5261 m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5262 m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5263
5264 shift = shift_1st;
5265 m128iAdd = _mm_set1_epi32(add_1st);
5266
5267 for (j = 0; j < 2; j++) {
5268 for (i = 0; i < 32; i += 8) {
5269 m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5270 E0l = _mm_madd_epi16(m128Tmp0,
5271 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5272 m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5273 E0h = _mm_madd_epi16(m128Tmp1,
5274 _mm_load_si128((__m128i *) (transform32x32[0][0])));
5275
5276 m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5277 E1l = _mm_madd_epi16(m128Tmp2,
5278 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5279 m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5280 E1h = _mm_madd_epi16(m128Tmp3,
5281 _mm_load_si128((__m128i *) (transform32x32[1][0])));
5282
5283 m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5284 E2l = _mm_madd_epi16(m128Tmp4,
5285 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5286 m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5287 E2h = _mm_madd_epi16(m128Tmp5,
5288 _mm_load_si128((__m128i *) (transform32x32[2][0])));
5289
5290 m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5291 E3l = _mm_madd_epi16(m128Tmp6,
5292 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5293 m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5294 E3h = _mm_madd_epi16(m128Tmp7,
5295 _mm_load_si128((__m128i *) (transform32x32[3][0])));
5296
5297 m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5298 E4l = _mm_madd_epi16(m128Tmp8,
5299 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5300 m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5301 E4h = _mm_madd_epi16(m128Tmp9,
5302 _mm_load_si128((__m128i *) (transform32x32[4][0])));
5303
5304 m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5305 E5l = _mm_madd_epi16(m128Tmp10,
5306 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5307 m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5308 E5h = _mm_madd_epi16(m128Tmp11,
5309 _mm_load_si128((__m128i *) (transform32x32[5][0])));
5310
5311 m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5312 E6l = _mm_madd_epi16(m128Tmp12,
5313 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5314 m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5315 E6h = _mm_madd_epi16(m128Tmp13,
5316 _mm_load_si128((__m128i *) (transform32x32[6][0])));
5317
5318 m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5319 E7l = _mm_madd_epi16(m128Tmp14,
5320 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5321 m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5322 E7h = _mm_madd_epi16(m128Tmp15,
5323 _mm_load_si128((__m128i *) (transform32x32[7][0])));
5324
5325 O0l = _mm_add_epi32(E0l, E1l);
5326 O0l = _mm_add_epi32(O0l, E2l);
5327 O0l = _mm_add_epi32(O0l, E3l);
5328 O0l = _mm_add_epi32(O0l, E4l);
5329 O0l = _mm_add_epi32(O0l, E5l);
5330 O0l = _mm_add_epi32(O0l, E6l);
5331 O0l = _mm_add_epi32(O0l, E7l);
5332
5333 O0h = _mm_add_epi32(E0h, E1h);
5334 O0h = _mm_add_epi32(O0h, E2h);
5335 O0h = _mm_add_epi32(O0h, E3h);
5336 O0h = _mm_add_epi32(O0h, E4h);
5337 O0h = _mm_add_epi32(O0h, E5h);
5338 O0h = _mm_add_epi32(O0h, E6h);
5339 O0h = _mm_add_epi32(O0h, E7h);
5340
5341 /* Compute O1*/
5342 E0l = _mm_madd_epi16(m128Tmp0,
5343 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5344 E0h = _mm_madd_epi16(m128Tmp1,
5345 _mm_load_si128((__m128i *) (transform32x32[0][1])));
5346 E1l = _mm_madd_epi16(m128Tmp2,
5347 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5348 E1h = _mm_madd_epi16(m128Tmp3,
5349 _mm_load_si128((__m128i *) (transform32x32[1][1])));
5350 E2l = _mm_madd_epi16(m128Tmp4,
5351 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5352 E2h = _mm_madd_epi16(m128Tmp5,
5353 _mm_load_si128((__m128i *) (transform32x32[2][1])));
5354 E3l = _mm_madd_epi16(m128Tmp6,
5355 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5356 E3h = _mm_madd_epi16(m128Tmp7,
5357 _mm_load_si128((__m128i *) (transform32x32[3][1])));
5358
5359 E4l = _mm_madd_epi16(m128Tmp8,
5360 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5361 E4h = _mm_madd_epi16(m128Tmp9,
5362 _mm_load_si128((__m128i *) (transform32x32[4][1])));
5363 E5l = _mm_madd_epi16(m128Tmp10,
5364 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5365 E5h = _mm_madd_epi16(m128Tmp11,
5366 _mm_load_si128((__m128i *) (transform32x32[5][1])));
5367 E6l = _mm_madd_epi16(m128Tmp12,
5368 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5369 E6h = _mm_madd_epi16(m128Tmp13,
5370 _mm_load_si128((__m128i *) (transform32x32[6][1])));
5371 E7l = _mm_madd_epi16(m128Tmp14,
5372 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5373 E7h = _mm_madd_epi16(m128Tmp15,
5374 _mm_load_si128((__m128i *) (transform32x32[7][1])));
5375
5376 O1l = _mm_add_epi32(E0l, E1l);
5377 O1l = _mm_add_epi32(O1l, E2l);
5378 O1l = _mm_add_epi32(O1l, E3l);
5379 O1l = _mm_add_epi32(O1l, E4l);
5380 O1l = _mm_add_epi32(O1l, E5l);
5381 O1l = _mm_add_epi32(O1l, E6l);
5382 O1l = _mm_add_epi32(O1l, E7l);
5383
5384 O1h = _mm_add_epi32(E0h, E1h);
5385 O1h = _mm_add_epi32(O1h, E2h);
5386 O1h = _mm_add_epi32(O1h, E3h);
5387 O1h = _mm_add_epi32(O1h, E4h);
5388 O1h = _mm_add_epi32(O1h, E5h);
5389 O1h = _mm_add_epi32(O1h, E6h);
5390 O1h = _mm_add_epi32(O1h, E7h);
5391 /* Compute O2*/
5392 E0l = _mm_madd_epi16(m128Tmp0,
5393 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5394 E0h = _mm_madd_epi16(m128Tmp1,
5395 _mm_load_si128((__m128i *) (transform32x32[0][2])));
5396 E1l = _mm_madd_epi16(m128Tmp2,
5397 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5398 E1h = _mm_madd_epi16(m128Tmp3,
5399 _mm_load_si128((__m128i *) (transform32x32[1][2])));
5400 E2l = _mm_madd_epi16(m128Tmp4,
5401 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5402 E2h = _mm_madd_epi16(m128Tmp5,
5403 _mm_load_si128((__m128i *) (transform32x32[2][2])));
5404 E3l = _mm_madd_epi16(m128Tmp6,
5405 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5406 E3h = _mm_madd_epi16(m128Tmp7,
5407 _mm_load_si128((__m128i *) (transform32x32[3][2])));
5408
5409 E4l = _mm_madd_epi16(m128Tmp8,
5410 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5411 E4h = _mm_madd_epi16(m128Tmp9,
5412 _mm_load_si128((__m128i *) (transform32x32[4][2])));
5413 E5l = _mm_madd_epi16(m128Tmp10,
5414 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5415 E5h = _mm_madd_epi16(m128Tmp11,
5416 _mm_load_si128((__m128i *) (transform32x32[5][2])));
5417 E6l = _mm_madd_epi16(m128Tmp12,
5418 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5419 E6h = _mm_madd_epi16(m128Tmp13,
5420 _mm_load_si128((__m128i *) (transform32x32[6][2])));
5421 E7l = _mm_madd_epi16(m128Tmp14,
5422 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5423 E7h = _mm_madd_epi16(m128Tmp15,
5424 _mm_load_si128((__m128i *) (transform32x32[7][2])));
5425
5426 O2l = _mm_add_epi32(E0l, E1l);
5427 O2l = _mm_add_epi32(O2l, E2l);
5428 O2l = _mm_add_epi32(O2l, E3l);
5429 O2l = _mm_add_epi32(O2l, E4l);
5430 O2l = _mm_add_epi32(O2l, E5l);
5431 O2l = _mm_add_epi32(O2l, E6l);
5432 O2l = _mm_add_epi32(O2l, E7l);
5433
5434 O2h = _mm_add_epi32(E0h, E1h);
5435 O2h = _mm_add_epi32(O2h, E2h);
5436 O2h = _mm_add_epi32(O2h, E3h);
5437 O2h = _mm_add_epi32(O2h, E4h);
5438 O2h = _mm_add_epi32(O2h, E5h);
5439 O2h = _mm_add_epi32(O2h, E6h);
5440 O2h = _mm_add_epi32(O2h, E7h);
5441 /* Compute O3*/
5442 E0l = _mm_madd_epi16(m128Tmp0,
5443 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5444 E0h = _mm_madd_epi16(m128Tmp1,
5445 _mm_load_si128((__m128i *) (transform32x32[0][3])));
5446 E1l = _mm_madd_epi16(m128Tmp2,
5447 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5448 E1h = _mm_madd_epi16(m128Tmp3,
5449 _mm_load_si128((__m128i *) (transform32x32[1][3])));
5450 E2l = _mm_madd_epi16(m128Tmp4,
5451 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5452 E2h = _mm_madd_epi16(m128Tmp5,
5453 _mm_load_si128((__m128i *) (transform32x32[2][3])));
5454 E3l = _mm_madd_epi16(m128Tmp6,
5455 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5456 E3h = _mm_madd_epi16(m128Tmp7,
5457 _mm_load_si128((__m128i *) (transform32x32[3][3])));
5458
5459 E4l = _mm_madd_epi16(m128Tmp8,
5460 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5461 E4h = _mm_madd_epi16(m128Tmp9,
5462 _mm_load_si128((__m128i *) (transform32x32[4][3])));
5463 E5l = _mm_madd_epi16(m128Tmp10,
5464 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5465 E5h = _mm_madd_epi16(m128Tmp11,
5466 _mm_load_si128((__m128i *) (transform32x32[5][3])));
5467 E6l = _mm_madd_epi16(m128Tmp12,
5468 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5469 E6h = _mm_madd_epi16(m128Tmp13,
5470 _mm_load_si128((__m128i *) (transform32x32[6][3])));
5471 E7l = _mm_madd_epi16(m128Tmp14,
5472 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5473 E7h = _mm_madd_epi16(m128Tmp15,
5474 _mm_load_si128((__m128i *) (transform32x32[7][3])));
5475
5476 O3l = _mm_add_epi32(E0l, E1l);
5477 O3l = _mm_add_epi32(O3l, E2l);
5478 O3l = _mm_add_epi32(O3l, E3l);
5479 O3l = _mm_add_epi32(O3l, E4l);
5480 O3l = _mm_add_epi32(O3l, E5l);
5481 O3l = _mm_add_epi32(O3l, E6l);
5482 O3l = _mm_add_epi32(O3l, E7l);
5483
5484 O3h = _mm_add_epi32(E0h, E1h);
5485 O3h = _mm_add_epi32(O3h, E2h);
5486 O3h = _mm_add_epi32(O3h, E3h);
5487 O3h = _mm_add_epi32(O3h, E4h);
5488 O3h = _mm_add_epi32(O3h, E5h);
5489 O3h = _mm_add_epi32(O3h, E6h);
5490 O3h = _mm_add_epi32(O3h, E7h);
5491 /* Compute O4*/
5492
5493 E0l = _mm_madd_epi16(m128Tmp0,
5494 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5495 E0h = _mm_madd_epi16(m128Tmp1,
5496 _mm_load_si128((__m128i *) (transform32x32[0][4])));
5497 E1l = _mm_madd_epi16(m128Tmp2,
5498 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5499 E1h = _mm_madd_epi16(m128Tmp3,
5500 _mm_load_si128((__m128i *) (transform32x32[1][4])));
5501 E2l = _mm_madd_epi16(m128Tmp4,
5502 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5503 E2h = _mm_madd_epi16(m128Tmp5,
5504 _mm_load_si128((__m128i *) (transform32x32[2][4])));
5505 E3l = _mm_madd_epi16(m128Tmp6,
5506 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5507 E3h = _mm_madd_epi16(m128Tmp7,
5508 _mm_load_si128((__m128i *) (transform32x32[3][4])));
5509
5510 E4l = _mm_madd_epi16(m128Tmp8,
5511 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5512 E4h = _mm_madd_epi16(m128Tmp9,
5513 _mm_load_si128((__m128i *) (transform32x32[4][4])));
5514 E5l = _mm_madd_epi16(m128Tmp10,
5515 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5516 E5h = _mm_madd_epi16(m128Tmp11,
5517 _mm_load_si128((__m128i *) (transform32x32[5][4])));
5518 E6l = _mm_madd_epi16(m128Tmp12,
5519 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5520 E6h = _mm_madd_epi16(m128Tmp13,
5521 _mm_load_si128((__m128i *) (transform32x32[6][4])));
5522 E7l = _mm_madd_epi16(m128Tmp14,
5523 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5524 E7h = _mm_madd_epi16(m128Tmp15,
5525 _mm_load_si128((__m128i *) (transform32x32[7][4])));
5526
5527 O4l = _mm_add_epi32(E0l, E1l);
5528 O4l = _mm_add_epi32(O4l, E2l);
5529 O4l = _mm_add_epi32(O4l, E3l);
5530 O4l = _mm_add_epi32(O4l, E4l);
5531 O4l = _mm_add_epi32(O4l, E5l);
5532 O4l = _mm_add_epi32(O4l, E6l);
5533 O4l = _mm_add_epi32(O4l, E7l);
5534
5535 O4h = _mm_add_epi32(E0h, E1h);
5536 O4h = _mm_add_epi32(O4h, E2h);
5537 O4h = _mm_add_epi32(O4h, E3h);
5538 O4h = _mm_add_epi32(O4h, E4h);
5539 O4h = _mm_add_epi32(O4h, E5h);
5540 O4h = _mm_add_epi32(O4h, E6h);
5541 O4h = _mm_add_epi32(O4h, E7h);
5542
5543 /* Compute O5*/
5544 E0l = _mm_madd_epi16(m128Tmp0,
5545 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5546 E0h = _mm_madd_epi16(m128Tmp1,
5547 _mm_load_si128((__m128i *) (transform32x32[0][5])));
5548 E1l = _mm_madd_epi16(m128Tmp2,
5549 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5550 E1h = _mm_madd_epi16(m128Tmp3,
5551 _mm_load_si128((__m128i *) (transform32x32[1][5])));
5552 E2l = _mm_madd_epi16(m128Tmp4,
5553 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5554 E2h = _mm_madd_epi16(m128Tmp5,
5555 _mm_load_si128((__m128i *) (transform32x32[2][5])));
5556 E3l = _mm_madd_epi16(m128Tmp6,
5557 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5558 E3h = _mm_madd_epi16(m128Tmp7,
5559 _mm_load_si128((__m128i *) (transform32x32[3][5])));
5560
5561 E4l = _mm_madd_epi16(m128Tmp8,
5562 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5563 E4h = _mm_madd_epi16(m128Tmp9,
5564 _mm_load_si128((__m128i *) (transform32x32[4][5])));
5565 E5l = _mm_madd_epi16(m128Tmp10,
5566 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5567 E5h = _mm_madd_epi16(m128Tmp11,
5568 _mm_load_si128((__m128i *) (transform32x32[5][5])));
5569 E6l = _mm_madd_epi16(m128Tmp12,
5570 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5571 E6h = _mm_madd_epi16(m128Tmp13,
5572 _mm_load_si128((__m128i *) (transform32x32[6][5])));
5573 E7l = _mm_madd_epi16(m128Tmp14,
5574 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5575 E7h = _mm_madd_epi16(m128Tmp15,
5576 _mm_load_si128((__m128i *) (transform32x32[7][5])));
5577
5578 O5l = _mm_add_epi32(E0l, E1l);
5579 O5l = _mm_add_epi32(O5l, E2l);
5580 O5l = _mm_add_epi32(O5l, E3l);
5581 O5l = _mm_add_epi32(O5l, E4l);
5582 O5l = _mm_add_epi32(O5l, E5l);
5583 O5l = _mm_add_epi32(O5l, E6l);
5584 O5l = _mm_add_epi32(O5l, E7l);
5585
5586 O5h = _mm_add_epi32(E0h, E1h);
5587 O5h = _mm_add_epi32(O5h, E2h);
5588 O5h = _mm_add_epi32(O5h, E3h);
5589 O5h = _mm_add_epi32(O5h, E4h);
5590 O5h = _mm_add_epi32(O5h, E5h);
5591 O5h = _mm_add_epi32(O5h, E6h);
5592 O5h = _mm_add_epi32(O5h, E7h);
5593
5594 /* Compute O6*/
5595
5596 E0l = _mm_madd_epi16(m128Tmp0,
5597 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5598 E0h = _mm_madd_epi16(m128Tmp1,
5599 _mm_load_si128((__m128i *) (transform32x32[0][6])));
5600 E1l = _mm_madd_epi16(m128Tmp2,
5601 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5602 E1h = _mm_madd_epi16(m128Tmp3,
5603 _mm_load_si128((__m128i *) (transform32x32[1][6])));
5604 E2l = _mm_madd_epi16(m128Tmp4,
5605 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5606 E2h = _mm_madd_epi16(m128Tmp5,
5607 _mm_load_si128((__m128i *) (transform32x32[2][6])));
5608 E3l = _mm_madd_epi16(m128Tmp6,
5609 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5610 E3h = _mm_madd_epi16(m128Tmp7,
5611 _mm_load_si128((__m128i *) (transform32x32[3][6])));
5612
5613 E4l = _mm_madd_epi16(m128Tmp8,
5614 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5615 E4h = _mm_madd_epi16(m128Tmp9,
5616 _mm_load_si128((__m128i *) (transform32x32[4][6])));
5617 E5l = _mm_madd_epi16(m128Tmp10,
5618 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5619 E5h = _mm_madd_epi16(m128Tmp11,
5620 _mm_load_si128((__m128i *) (transform32x32[5][6])));
5621 E6l = _mm_madd_epi16(m128Tmp12,
5622 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5623 E6h = _mm_madd_epi16(m128Tmp13,
5624 _mm_load_si128((__m128i *) (transform32x32[6][6])));
5625 E7l = _mm_madd_epi16(m128Tmp14,
5626 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5627 E7h = _mm_madd_epi16(m128Tmp15,
5628 _mm_load_si128((__m128i *) (transform32x32[7][6])));
5629
5630 O6l = _mm_add_epi32(E0l, E1l);
5631 O6l = _mm_add_epi32(O6l, E2l);
5632 O6l = _mm_add_epi32(O6l, E3l);
5633 O6l = _mm_add_epi32(O6l, E4l);
5634 O6l = _mm_add_epi32(O6l, E5l);
5635 O6l = _mm_add_epi32(O6l, E6l);
5636 O6l = _mm_add_epi32(O6l, E7l);
5637
5638 O6h = _mm_add_epi32(E0h, E1h);
5639 O6h = _mm_add_epi32(O6h, E2h);
5640 O6h = _mm_add_epi32(O6h, E3h);
5641 O6h = _mm_add_epi32(O6h, E4h);
5642 O6h = _mm_add_epi32(O6h, E5h);
5643 O6h = _mm_add_epi32(O6h, E6h);
5644 O6h = _mm_add_epi32(O6h, E7h);
5645
5646 /* Compute O7*/
5647
5648 E0l = _mm_madd_epi16(m128Tmp0,
5649 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5650 E0h = _mm_madd_epi16(m128Tmp1,
5651 _mm_load_si128((__m128i *) (transform32x32[0][7])));
5652 E1l = _mm_madd_epi16(m128Tmp2,
5653 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5654 E1h = _mm_madd_epi16(m128Tmp3,
5655 _mm_load_si128((__m128i *) (transform32x32[1][7])));
5656 E2l = _mm_madd_epi16(m128Tmp4,
5657 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5658 E2h = _mm_madd_epi16(m128Tmp5,
5659 _mm_load_si128((__m128i *) (transform32x32[2][7])));
5660 E3l = _mm_madd_epi16(m128Tmp6,
5661 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5662 E3h = _mm_madd_epi16(m128Tmp7,
5663 _mm_load_si128((__m128i *) (transform32x32[3][7])));
5664
5665 E4l = _mm_madd_epi16(m128Tmp8,
5666 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5667 E4h = _mm_madd_epi16(m128Tmp9,
5668 _mm_load_si128((__m128i *) (transform32x32[4][7])));
5669 E5l = _mm_madd_epi16(m128Tmp10,
5670 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5671 E5h = _mm_madd_epi16(m128Tmp11,
5672 _mm_load_si128((__m128i *) (transform32x32[5][7])));
5673 E6l = _mm_madd_epi16(m128Tmp12,
5674 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5675 E6h = _mm_madd_epi16(m128Tmp13,
5676 _mm_load_si128((__m128i *) (transform32x32[6][7])));
5677 E7l = _mm_madd_epi16(m128Tmp14,
5678 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5679 E7h = _mm_madd_epi16(m128Tmp15,
5680 _mm_load_si128((__m128i *) (transform32x32[7][7])));
5681
5682 O7l = _mm_add_epi32(E0l, E1l);
5683 O7l = _mm_add_epi32(O7l, E2l);
5684 O7l = _mm_add_epi32(O7l, E3l);
5685 O7l = _mm_add_epi32(O7l, E4l);
5686 O7l = _mm_add_epi32(O7l, E5l);
5687 O7l = _mm_add_epi32(O7l, E6l);
5688 O7l = _mm_add_epi32(O7l, E7l);
5689
5690 O7h = _mm_add_epi32(E0h, E1h);
5691 O7h = _mm_add_epi32(O7h, E2h);
5692 O7h = _mm_add_epi32(O7h, E3h);
5693 O7h = _mm_add_epi32(O7h, E4h);
5694 O7h = _mm_add_epi32(O7h, E5h);
5695 O7h = _mm_add_epi32(O7h, E6h);
5696 O7h = _mm_add_epi32(O7h, E7h);
5697
5698 /* Compute O8*/
5699
5700 E0l = _mm_madd_epi16(m128Tmp0,
5701 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5702 E0h = _mm_madd_epi16(m128Tmp1,
5703 _mm_load_si128((__m128i *) (transform32x32[0][8])));
5704 E1l = _mm_madd_epi16(m128Tmp2,
5705 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5706 E1h = _mm_madd_epi16(m128Tmp3,
5707 _mm_load_si128((__m128i *) (transform32x32[1][8])));
5708 E2l = _mm_madd_epi16(m128Tmp4,
5709 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5710 E2h = _mm_madd_epi16(m128Tmp5,
5711 _mm_load_si128((__m128i *) (transform32x32[2][8])));
5712 E3l = _mm_madd_epi16(m128Tmp6,
5713 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5714 E3h = _mm_madd_epi16(m128Tmp7,
5715 _mm_load_si128((__m128i *) (transform32x32[3][8])));
5716
5717 E4l = _mm_madd_epi16(m128Tmp8,
5718 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5719 E4h = _mm_madd_epi16(m128Tmp9,
5720 _mm_load_si128((__m128i *) (transform32x32[4][8])));
5721 E5l = _mm_madd_epi16(m128Tmp10,
5722 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5723 E5h = _mm_madd_epi16(m128Tmp11,
5724 _mm_load_si128((__m128i *) (transform32x32[5][8])));
5725 E6l = _mm_madd_epi16(m128Tmp12,
5726 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5727 E6h = _mm_madd_epi16(m128Tmp13,
5728 _mm_load_si128((__m128i *) (transform32x32[6][8])));
5729 E7l = _mm_madd_epi16(m128Tmp14,
5730 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5731 E7h = _mm_madd_epi16(m128Tmp15,
5732 _mm_load_si128((__m128i *) (transform32x32[7][8])));
5733
5734 O8l = _mm_add_epi32(E0l, E1l);
5735 O8l = _mm_add_epi32(O8l, E2l);
5736 O8l = _mm_add_epi32(O8l, E3l);
5737 O8l = _mm_add_epi32(O8l, E4l);
5738 O8l = _mm_add_epi32(O8l, E5l);
5739 O8l = _mm_add_epi32(O8l, E6l);
5740 O8l = _mm_add_epi32(O8l, E7l);
5741
5742 O8h = _mm_add_epi32(E0h, E1h);
5743 O8h = _mm_add_epi32(O8h, E2h);
5744 O8h = _mm_add_epi32(O8h, E3h);
5745 O8h = _mm_add_epi32(O8h, E4h);
5746 O8h = _mm_add_epi32(O8h, E5h);
5747 O8h = _mm_add_epi32(O8h, E6h);
5748 O8h = _mm_add_epi32(O8h, E7h);
5749
5750 /* Compute O9*/
5751
5752 E0l = _mm_madd_epi16(m128Tmp0,
5753 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5754 E0h = _mm_madd_epi16(m128Tmp1,
5755 _mm_load_si128((__m128i *) (transform32x32[0][9])));
5756 E1l = _mm_madd_epi16(m128Tmp2,
5757 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5758 E1h = _mm_madd_epi16(m128Tmp3,
5759 _mm_load_si128((__m128i *) (transform32x32[1][9])));
5760 E2l = _mm_madd_epi16(m128Tmp4,
5761 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5762 E2h = _mm_madd_epi16(m128Tmp5,
5763 _mm_load_si128((__m128i *) (transform32x32[2][9])));
5764 E3l = _mm_madd_epi16(m128Tmp6,
5765 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5766 E3h = _mm_madd_epi16(m128Tmp7,
5767 _mm_load_si128((__m128i *) (transform32x32[3][9])));
5768
5769 E4l = _mm_madd_epi16(m128Tmp8,
5770 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5771 E4h = _mm_madd_epi16(m128Tmp9,
5772 _mm_load_si128((__m128i *) (transform32x32[4][9])));
5773 E5l = _mm_madd_epi16(m128Tmp10,
5774 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5775 E5h = _mm_madd_epi16(m128Tmp11,
5776 _mm_load_si128((__m128i *) (transform32x32[5][9])));
5777 E6l = _mm_madd_epi16(m128Tmp12,
5778 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5779 E6h = _mm_madd_epi16(m128Tmp13,
5780 _mm_load_si128((__m128i *) (transform32x32[6][9])));
5781 E7l = _mm_madd_epi16(m128Tmp14,
5782 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5783 E7h = _mm_madd_epi16(m128Tmp15,
5784 _mm_load_si128((__m128i *) (transform32x32[7][9])));
5785
5786 O9l = _mm_add_epi32(E0l, E1l);
5787 O9l = _mm_add_epi32(O9l, E2l);
5788 O9l = _mm_add_epi32(O9l, E3l);
5789 O9l = _mm_add_epi32(O9l, E4l);
5790 O9l = _mm_add_epi32(O9l, E5l);
5791 O9l = _mm_add_epi32(O9l, E6l);
5792 O9l = _mm_add_epi32(O9l, E7l);
5793
5794 O9h = _mm_add_epi32(E0h, E1h);
5795 O9h = _mm_add_epi32(O9h, E2h);
5796 O9h = _mm_add_epi32(O9h, E3h);
5797 O9h = _mm_add_epi32(O9h, E4h);
5798 O9h = _mm_add_epi32(O9h, E5h);
5799 O9h = _mm_add_epi32(O9h, E6h);
5800 O9h = _mm_add_epi32(O9h, E7h);
5801
5802 /* Compute O10 */
5803
5804 E0l = _mm_madd_epi16(m128Tmp0,
5805 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5806 E0h = _mm_madd_epi16(m128Tmp1,
5807 _mm_load_si128((__m128i *) (transform32x32[0][10])));
5808 E1l = _mm_madd_epi16(m128Tmp2,
5809 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5810 E1h = _mm_madd_epi16(m128Tmp3,
5811 _mm_load_si128((__m128i *) (transform32x32[1][10])));
5812 E2l = _mm_madd_epi16(m128Tmp4,
5813 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5814 E2h = _mm_madd_epi16(m128Tmp5,
5815 _mm_load_si128((__m128i *) (transform32x32[2][10])));
5816 E3l = _mm_madd_epi16(m128Tmp6,
5817 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5818 E3h = _mm_madd_epi16(m128Tmp7,
5819 _mm_load_si128((__m128i *) (transform32x32[3][10])));
5820
5821 E4l = _mm_madd_epi16(m128Tmp8,
5822 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5823 E4h = _mm_madd_epi16(m128Tmp9,
5824 _mm_load_si128((__m128i *) (transform32x32[4][10])));
5825 E5l = _mm_madd_epi16(m128Tmp10,
5826 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5827 E5h = _mm_madd_epi16(m128Tmp11,
5828 _mm_load_si128((__m128i *) (transform32x32[5][10])));
5829 E6l = _mm_madd_epi16(m128Tmp12,
5830 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5831 E6h = _mm_madd_epi16(m128Tmp13,
5832 _mm_load_si128((__m128i *) (transform32x32[6][10])));
5833 E7l = _mm_madd_epi16(m128Tmp14,
5834 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5835 E7h = _mm_madd_epi16(m128Tmp15,
5836 _mm_load_si128((__m128i *) (transform32x32[7][10])));
5837
5838 O10l = _mm_add_epi32(E0l, E1l);
5839 O10l = _mm_add_epi32(O10l, E2l);
5840 O10l = _mm_add_epi32(O10l, E3l);
5841 O10l = _mm_add_epi32(O10l, E4l);
5842 O10l = _mm_add_epi32(O10l, E5l);
5843 O10l = _mm_add_epi32(O10l, E6l);
5844 O10l = _mm_add_epi32(O10l, E7l);
5845
5846 O10h = _mm_add_epi32(E0h, E1h);
5847 O10h = _mm_add_epi32(O10h, E2h);
5848 O10h = _mm_add_epi32(O10h, E3h);
5849 O10h = _mm_add_epi32(O10h, E4h);
5850 O10h = _mm_add_epi32(O10h, E5h);
5851 O10h = _mm_add_epi32(O10h, E6h);
5852 O10h = _mm_add_epi32(O10h, E7h);
5853
5854 /* Compute O11 */
5855
5856 E0l = _mm_madd_epi16(m128Tmp0,
5857 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5858 E0h = _mm_madd_epi16(m128Tmp1,
5859 _mm_load_si128((__m128i *) (transform32x32[0][11])));
5860 E1l = _mm_madd_epi16(m128Tmp2,
5861 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5862 E1h = _mm_madd_epi16(m128Tmp3,
5863 _mm_load_si128((__m128i *) (transform32x32[1][11])));
5864 E2l = _mm_madd_epi16(m128Tmp4,
5865 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5866 E2h = _mm_madd_epi16(m128Tmp5,
5867 _mm_load_si128((__m128i *) (transform32x32[2][11])));
5868 E3l = _mm_madd_epi16(m128Tmp6,
5869 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5870 E3h = _mm_madd_epi16(m128Tmp7,
5871 _mm_load_si128((__m128i *) (transform32x32[3][11])));
5872
5873 E4l = _mm_madd_epi16(m128Tmp8,
5874 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5875 E4h = _mm_madd_epi16(m128Tmp9,
5876 _mm_load_si128((__m128i *) (transform32x32[4][11])));
5877 E5l = _mm_madd_epi16(m128Tmp10,
5878 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5879 E5h = _mm_madd_epi16(m128Tmp11,
5880 _mm_load_si128((__m128i *) (transform32x32[5][11])));
5881 E6l = _mm_madd_epi16(m128Tmp12,
5882 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5883 E6h = _mm_madd_epi16(m128Tmp13,
5884 _mm_load_si128((__m128i *) (transform32x32[6][11])));
5885 E7l = _mm_madd_epi16(m128Tmp14,
5886 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5887 E7h = _mm_madd_epi16(m128Tmp15,
5888 _mm_load_si128((__m128i *) (transform32x32[7][11])));
5889
5890 O11l = _mm_add_epi32(E0l, E1l);
5891 O11l = _mm_add_epi32(O11l, E2l);
5892 O11l = _mm_add_epi32(O11l, E3l);
5893 O11l = _mm_add_epi32(O11l, E4l);
5894 O11l = _mm_add_epi32(O11l, E5l);
5895 O11l = _mm_add_epi32(O11l, E6l);
5896 O11l = _mm_add_epi32(O11l, E7l);
5897
5898 O11h = _mm_add_epi32(E0h, E1h);
5899 O11h = _mm_add_epi32(O11h, E2h);
5900 O11h = _mm_add_epi32(O11h, E3h);
5901 O11h = _mm_add_epi32(O11h, E4h);
5902 O11h = _mm_add_epi32(O11h, E5h);
5903 O11h = _mm_add_epi32(O11h, E6h);
5904 O11h = _mm_add_epi32(O11h, E7h);
5905
5906 /* Compute O12 */
5907
5908 E0l = _mm_madd_epi16(m128Tmp0,
5909 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5910 E0h = _mm_madd_epi16(m128Tmp1,
5911 _mm_load_si128((__m128i *) (transform32x32[0][12])));
5912 E1l = _mm_madd_epi16(m128Tmp2,
5913 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5914 E1h = _mm_madd_epi16(m128Tmp3,
5915 _mm_load_si128((__m128i *) (transform32x32[1][12])));
5916 E2l = _mm_madd_epi16(m128Tmp4,
5917 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5918 E2h = _mm_madd_epi16(m128Tmp5,
5919 _mm_load_si128((__m128i *) (transform32x32[2][12])));
5920 E3l = _mm_madd_epi16(m128Tmp6,
5921 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5922 E3h = _mm_madd_epi16(m128Tmp7,
5923 _mm_load_si128((__m128i *) (transform32x32[3][12])));
5924
5925 E4l = _mm_madd_epi16(m128Tmp8,
5926 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5927 E4h = _mm_madd_epi16(m128Tmp9,
5928 _mm_load_si128((__m128i *) (transform32x32[4][12])));
5929 E5l = _mm_madd_epi16(m128Tmp10,
5930 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5931 E5h = _mm_madd_epi16(m128Tmp11,
5932 _mm_load_si128((__m128i *) (transform32x32[5][12])));
5933 E6l = _mm_madd_epi16(m128Tmp12,
5934 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5935 E6h = _mm_madd_epi16(m128Tmp13,
5936 _mm_load_si128((__m128i *) (transform32x32[6][12])));
5937 E7l = _mm_madd_epi16(m128Tmp14,
5938 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5939 E7h = _mm_madd_epi16(m128Tmp15,
5940 _mm_load_si128((__m128i *) (transform32x32[7][12])));
5941
5942 O12l = _mm_add_epi32(E0l, E1l);
5943 O12l = _mm_add_epi32(O12l, E2l);
5944 O12l = _mm_add_epi32(O12l, E3l);
5945 O12l = _mm_add_epi32(O12l, E4l);
5946 O12l = _mm_add_epi32(O12l, E5l);
5947 O12l = _mm_add_epi32(O12l, E6l);
5948 O12l = _mm_add_epi32(O12l, E7l);
5949
5950 O12h = _mm_add_epi32(E0h, E1h);
5951 O12h = _mm_add_epi32(O12h, E2h);
5952 O12h = _mm_add_epi32(O12h, E3h);
5953 O12h = _mm_add_epi32(O12h, E4h);
5954 O12h = _mm_add_epi32(O12h, E5h);
5955 O12h = _mm_add_epi32(O12h, E6h);
5956 O12h = _mm_add_epi32(O12h, E7h);
5957
5958 /* Compute O13 */
5959
5960 E0l = _mm_madd_epi16(m128Tmp0,
5961 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5962 E0h = _mm_madd_epi16(m128Tmp1,
5963 _mm_load_si128((__m128i *) (transform32x32[0][13])));
5964 E1l = _mm_madd_epi16(m128Tmp2,
5965 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5966 E1h = _mm_madd_epi16(m128Tmp3,
5967 _mm_load_si128((__m128i *) (transform32x32[1][13])));
5968 E2l = _mm_madd_epi16(m128Tmp4,
5969 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5970 E2h = _mm_madd_epi16(m128Tmp5,
5971 _mm_load_si128((__m128i *) (transform32x32[2][13])));
5972 E3l = _mm_madd_epi16(m128Tmp6,
5973 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5974 E3h = _mm_madd_epi16(m128Tmp7,
5975 _mm_load_si128((__m128i *) (transform32x32[3][13])));
5976
5977 E4l = _mm_madd_epi16(m128Tmp8,
5978 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5979 E4h = _mm_madd_epi16(m128Tmp9,
5980 _mm_load_si128((__m128i *) (transform32x32[4][13])));
5981 E5l = _mm_madd_epi16(m128Tmp10,
5982 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5983 E5h = _mm_madd_epi16(m128Tmp11,
5984 _mm_load_si128((__m128i *) (transform32x32[5][13])));
5985 E6l = _mm_madd_epi16(m128Tmp12,
5986 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5987 E6h = _mm_madd_epi16(m128Tmp13,
5988 _mm_load_si128((__m128i *) (transform32x32[6][13])));
5989 E7l = _mm_madd_epi16(m128Tmp14,
5990 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5991 E7h = _mm_madd_epi16(m128Tmp15,
5992 _mm_load_si128((__m128i *) (transform32x32[7][13])));
5993
5994 O13l = _mm_add_epi32(E0l, E1l);
5995 O13l = _mm_add_epi32(O13l, E2l);
5996 O13l = _mm_add_epi32(O13l, E3l);
5997 O13l = _mm_add_epi32(O13l, E4l);
5998 O13l = _mm_add_epi32(O13l, E5l);
5999 O13l = _mm_add_epi32(O13l, E6l);
6000 O13l = _mm_add_epi32(O13l, E7l);
6001
6002 O13h = _mm_add_epi32(E0h, E1h);
6003 O13h = _mm_add_epi32(O13h, E2h);
6004 O13h = _mm_add_epi32(O13h, E3h);
6005 O13h = _mm_add_epi32(O13h, E4h);
6006 O13h = _mm_add_epi32(O13h, E5h);
6007 O13h = _mm_add_epi32(O13h, E6h);
6008 O13h = _mm_add_epi32(O13h, E7h);
6009
6010 /* Compute O14 */
6011
6012 E0l = _mm_madd_epi16(m128Tmp0,
6013 _mm_load_si128((__m128i *) (transform32x32[0][14])));
6014 E0h = _mm_madd_epi16(m128Tmp1,
6015 _mm_load_si128((__m128i *) (transform32x32[0][14])));
6016 E1l = _mm_madd_epi16(m128Tmp2,
6017 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6018 E1h = _mm_madd_epi16(m128Tmp3,
6019 _mm_load_si128((__m128i *) (transform32x32[1][14])));
6020 E2l = _mm_madd_epi16(m128Tmp4,
6021 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6022 E2h = _mm_madd_epi16(m128Tmp5,
6023 _mm_load_si128((__m128i *) (transform32x32[2][14])));
6024 E3l = _mm_madd_epi16(m128Tmp6,
6025 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6026 E3h = _mm_madd_epi16(m128Tmp7,
6027 _mm_load_si128((__m128i *) (transform32x32[3][14])));
6028
6029 E4l = _mm_madd_epi16(m128Tmp8,
6030 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6031 E4h = _mm_madd_epi16(m128Tmp9,
6032 _mm_load_si128((__m128i *) (transform32x32[4][14])));
6033 E5l = _mm_madd_epi16(m128Tmp10,
6034 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6035 E5h = _mm_madd_epi16(m128Tmp11,
6036 _mm_load_si128((__m128i *) (transform32x32[5][14])));
6037 E6l = _mm_madd_epi16(m128Tmp12,
6038 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6039 E6h = _mm_madd_epi16(m128Tmp13,
6040 _mm_load_si128((__m128i *) (transform32x32[6][14])));
6041 E7l = _mm_madd_epi16(m128Tmp14,
6042 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6043 E7h = _mm_madd_epi16(m128Tmp15,
6044 _mm_load_si128((__m128i *) (transform32x32[7][14])));
6045
6046 O14l = _mm_add_epi32(E0l, E1l);
6047 O14l = _mm_add_epi32(O14l, E2l);
6048 O14l = _mm_add_epi32(O14l, E3l);
6049 O14l = _mm_add_epi32(O14l, E4l);
6050 O14l = _mm_add_epi32(O14l, E5l);
6051 O14l = _mm_add_epi32(O14l, E6l);
6052 O14l = _mm_add_epi32(O14l, E7l);
6053
6054 O14h = _mm_add_epi32(E0h, E1h);
6055 O14h = _mm_add_epi32(O14h, E2h);
6056 O14h = _mm_add_epi32(O14h, E3h);
6057 O14h = _mm_add_epi32(O14h, E4h);
6058 O14h = _mm_add_epi32(O14h, E5h);
6059 O14h = _mm_add_epi32(O14h, E6h);
6060 O14h = _mm_add_epi32(O14h, E7h);
6061
6062 /* Compute O15*/
6063
6064 E0l = _mm_madd_epi16(m128Tmp0,
6065 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6066 E0h = _mm_madd_epi16(m128Tmp1,
6067 _mm_load_si128((__m128i *) (transform32x32[0][15])));
6068 E1l = _mm_madd_epi16(m128Tmp2,
6069 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6070 E1h = _mm_madd_epi16(m128Tmp3,
6071 _mm_load_si128((__m128i *) (transform32x32[1][15])));
6072 E2l = _mm_madd_epi16(m128Tmp4,
6073 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6074 E2h = _mm_madd_epi16(m128Tmp5,
6075 _mm_load_si128((__m128i *) (transform32x32[2][15])));
6076 E3l = _mm_madd_epi16(m128Tmp6,
6077 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6078 E3h = _mm_madd_epi16(m128Tmp7,
6079 _mm_load_si128((__m128i *) (transform32x32[3][15])));
6080
6081 E4l = _mm_madd_epi16(m128Tmp8,
6082 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6083 E4h = _mm_madd_epi16(m128Tmp9,
6084 _mm_load_si128((__m128i *) (transform32x32[4][15])));
6085 E5l = _mm_madd_epi16(m128Tmp10,
6086 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6087 E5h = _mm_madd_epi16(m128Tmp11,
6088 _mm_load_si128((__m128i *) (transform32x32[5][15])));
6089 E6l = _mm_madd_epi16(m128Tmp12,
6090 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6091 E6h = _mm_madd_epi16(m128Tmp13,
6092 _mm_load_si128((__m128i *) (transform32x32[6][15])));
6093 E7l = _mm_madd_epi16(m128Tmp14,
6094 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6095 E7h = _mm_madd_epi16(m128Tmp15,
6096 _mm_load_si128((__m128i *) (transform32x32[7][15])));
6097
6098 O15l = _mm_add_epi32(E0l, E1l);
6099 O15l = _mm_add_epi32(O15l, E2l);
6100 O15l = _mm_add_epi32(O15l, E3l);
6101 O15l = _mm_add_epi32(O15l, E4l);
6102 O15l = _mm_add_epi32(O15l, E5l);
6103 O15l = _mm_add_epi32(O15l, E6l);
6104 O15l = _mm_add_epi32(O15l, E7l);
6105
6106 O15h = _mm_add_epi32(E0h, E1h);
6107 O15h = _mm_add_epi32(O15h, E2h);
6108 O15h = _mm_add_epi32(O15h, E3h);
6109 O15h = _mm_add_epi32(O15h, E4h);
6110 O15h = _mm_add_epi32(O15h, E5h);
6111 O15h = _mm_add_epi32(O15h, E6h);
6112 O15h = _mm_add_epi32(O15h, E7h);
6113 /* Compute E0 */
6114
6115 m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6116 E0l = _mm_madd_epi16(m128Tmp0,
6117 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6118 m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6119 E0h = _mm_madd_epi16(m128Tmp1,
6120 _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6121
6122 m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6123 E0l = _mm_add_epi32(E0l,
6124 _mm_madd_epi16(m128Tmp2,
6125 _mm_load_si128(
6126 (__m128i *) (transform16x16_1[1][0]))));
6127 m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6128 E0h = _mm_add_epi32(E0h,
6129 _mm_madd_epi16(m128Tmp3,
6130 _mm_load_si128(
6131 (__m128i *) (transform16x16_1[1][0]))));
6132
6133 m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6134 E0l = _mm_add_epi32(E0l,
6135 _mm_madd_epi16(m128Tmp4,
6136 _mm_load_si128(
6137 (__m128i *) (transform16x16_1[2][0]))));
6138 m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6139 E0h = _mm_add_epi32(E0h,
6140 _mm_madd_epi16(m128Tmp5,
6141 _mm_load_si128(
6142 (__m128i *) (transform16x16_1[2][0]))));
6143
6144 m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6145 E0l = _mm_add_epi32(E0l,
6146 _mm_madd_epi16(m128Tmp6,
6147 _mm_load_si128(
6148 (__m128i *) (transform16x16_1[3][0]))));
6149 m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6150 E0h = _mm_add_epi32(E0h,
6151 _mm_madd_epi16(m128Tmp7,
6152 _mm_load_si128(
6153 (__m128i *) (transform16x16_1[3][0]))));
6154
6155 /* Compute E1 */
6156 E1l = _mm_madd_epi16(m128Tmp0,
6157 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6158 E1h = _mm_madd_epi16(m128Tmp1,
6159 _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6160 E1l = _mm_add_epi32(E1l,
6161 _mm_madd_epi16(m128Tmp2,
6162 _mm_load_si128(
6163 (__m128i *) (transform16x16_1[1][1]))));
6164 E1h = _mm_add_epi32(E1h,
6165 _mm_madd_epi16(m128Tmp3,
6166 _mm_load_si128(
6167 (__m128i *) (transform16x16_1[1][1]))));
6168 E1l = _mm_add_epi32(E1l,
6169 _mm_madd_epi16(m128Tmp4,
6170 _mm_load_si128(
6171 (__m128i *) (transform16x16_1[2][1]))));
6172 E1h = _mm_add_epi32(E1h,
6173 _mm_madd_epi16(m128Tmp5,
6174 _mm_load_si128(
6175 (__m128i *) (transform16x16_1[2][1]))));
6176 E1l = _mm_add_epi32(E1l,
6177 _mm_madd_epi16(m128Tmp6,
6178 _mm_load_si128(
6179 (__m128i *) (transform16x16_1[3][1]))));
6180 E1h = _mm_add_epi32(E1h,
6181 _mm_madd_epi16(m128Tmp7,
6182 _mm_load_si128(
6183 (__m128i *) (transform16x16_1[3][1]))));
6184
6185 /* Compute E2 */
6186 E2l = _mm_madd_epi16(m128Tmp0,
6187 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6188 E2h = _mm_madd_epi16(m128Tmp1,
6189 _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6190 E2l = _mm_add_epi32(E2l,
6191 _mm_madd_epi16(m128Tmp2,
6192 _mm_load_si128(
6193 (__m128i *) (transform16x16_1[1][2]))));
6194 E2h = _mm_add_epi32(E2h,
6195 _mm_madd_epi16(m128Tmp3,
6196 _mm_load_si128(
6197 (__m128i *) (transform16x16_1[1][2]))));
6198 E2l = _mm_add_epi32(E2l,
6199 _mm_madd_epi16(m128Tmp4,
6200 _mm_load_si128(
6201 (__m128i *) (transform16x16_1[2][2]))));
6202 E2h = _mm_add_epi32(E2h,
6203 _mm_madd_epi16(m128Tmp5,
6204 _mm_load_si128(
6205 (__m128i *) (transform16x16_1[2][2]))));
6206 E2l = _mm_add_epi32(E2l,
6207 _mm_madd_epi16(m128Tmp6,
6208 _mm_load_si128(
6209 (__m128i *) (transform16x16_1[3][2]))));
6210 E2h = _mm_add_epi32(E2h,
6211 _mm_madd_epi16(m128Tmp7,
6212 _mm_load_si128(
6213 (__m128i *) (transform16x16_1[3][2]))));
6214
6215 /* Compute E3 */
6216 E3l = _mm_madd_epi16(m128Tmp0,
6217 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6218 E3h = _mm_madd_epi16(m128Tmp1,
6219 _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6220 E3l = _mm_add_epi32(E3l,
6221 _mm_madd_epi16(m128Tmp2,
6222 _mm_load_si128(
6223 (__m128i *) (transform16x16_1[1][3]))));
6224 E3h = _mm_add_epi32(E3h,
6225 _mm_madd_epi16(m128Tmp3,
6226 _mm_load_si128(
6227 (__m128i *) (transform16x16_1[1][3]))));
6228 E3l = _mm_add_epi32(E3l,
6229 _mm_madd_epi16(m128Tmp4,
6230 _mm_load_si128(
6231 (__m128i *) (transform16x16_1[2][3]))));
6232 E3h = _mm_add_epi32(E3h,
6233 _mm_madd_epi16(m128Tmp5,
6234 _mm_load_si128(
6235 (__m128i *) (transform16x16_1[2][3]))));
6236 E3l = _mm_add_epi32(E3l,
6237 _mm_madd_epi16(m128Tmp6,
6238 _mm_load_si128(
6239 (__m128i *) (transform16x16_1[3][3]))));
6240 E3h = _mm_add_epi32(E3h,
6241 _mm_madd_epi16(m128Tmp7,
6242 _mm_load_si128(
6243 (__m128i *) (transform16x16_1[3][3]))));
6244
6245 /* Compute E4 */
6246 E4l = _mm_madd_epi16(m128Tmp0,
6247 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6248 E4h = _mm_madd_epi16(m128Tmp1,
6249 _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6250 E4l = _mm_add_epi32(E4l,
6251 _mm_madd_epi16(m128Tmp2,
6252 _mm_load_si128(
6253 (__m128i *) (transform16x16_1[1][4]))));
6254 E4h = _mm_add_epi32(E4h,
6255 _mm_madd_epi16(m128Tmp3,
6256 _mm_load_si128(
6257 (__m128i *) (transform16x16_1[1][4]))));
6258 E4l = _mm_add_epi32(E4l,
6259 _mm_madd_epi16(m128Tmp4,
6260 _mm_load_si128(
6261 (__m128i *) (transform16x16_1[2][4]))));
6262 E4h = _mm_add_epi32(E4h,
6263 _mm_madd_epi16(m128Tmp5,
6264 _mm_load_si128(
6265 (__m128i *) (transform16x16_1[2][4]))));
6266 E4l = _mm_add_epi32(E4l,
6267 _mm_madd_epi16(m128Tmp6,
6268 _mm_load_si128(
6269 (__m128i *) (transform16x16_1[3][4]))));
6270 E4h = _mm_add_epi32(E4h,
6271 _mm_madd_epi16(m128Tmp7,
6272 _mm_load_si128(
6273 (__m128i *) (transform16x16_1[3][4]))));
6274
6275 /* Compute E5 */
6276 E5l = _mm_madd_epi16(m128Tmp0,
6277 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6278 E5h = _mm_madd_epi16(m128Tmp1,
6279 _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6280 E5l = _mm_add_epi32(E5l,
6281 _mm_madd_epi16(m128Tmp2,
6282 _mm_load_si128(
6283 (__m128i *) (transform16x16_1[1][5]))));
6284 E5h = _mm_add_epi32(E5h,
6285 _mm_madd_epi16(m128Tmp3,
6286 _mm_load_si128(
6287 (__m128i *) (transform16x16_1[1][5]))));
6288 E5l = _mm_add_epi32(E5l,
6289 _mm_madd_epi16(m128Tmp4,
6290 _mm_load_si128(
6291 (__m128i *) (transform16x16_1[2][5]))));
6292 E5h = _mm_add_epi32(E5h,
6293 _mm_madd_epi16(m128Tmp5,
6294 _mm_load_si128(
6295 (__m128i *) (transform16x16_1[2][5]))));
6296 E5l = _mm_add_epi32(E5l,
6297 _mm_madd_epi16(m128Tmp6,
6298 _mm_load_si128(
6299 (__m128i *) (transform16x16_1[3][5]))));
6300 E5h = _mm_add_epi32(E5h,
6301 _mm_madd_epi16(m128Tmp7,
6302 _mm_load_si128(
6303 (__m128i *) (transform16x16_1[3][5]))));
6304
6305 /* Compute E6 */
6306 E6l = _mm_madd_epi16(m128Tmp0,
6307 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6308 E6h = _mm_madd_epi16(m128Tmp1,
6309 _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
6310 E6l = _mm_add_epi32(E6l,
6311 _mm_madd_epi16(m128Tmp2,
6312 _mm_load_si128(
6313 (__m128i *) (transform16x16_1[1][6]))));
6314 E6h = _mm_add_epi32(E6h,
6315 _mm_madd_epi16(m128Tmp3,
6316 _mm_load_si128(
6317 (__m128i *) (transform16x16_1[1][6]))));
6318 E6l = _mm_add_epi32(E6l,
6319 _mm_madd_epi16(m128Tmp4,
6320 _mm_load_si128(
6321 (__m128i *) (transform16x16_1[2][6]))));
6322 E6h = _mm_add_epi32(E6h,
6323 _mm_madd_epi16(m128Tmp5,
6324 _mm_load_si128(
6325 (__m128i *) (transform16x16_1[2][6]))));
6326 E6l = _mm_add_epi32(E6l,
6327 _mm_madd_epi16(m128Tmp6,
6328 _mm_load_si128(
6329 (__m128i *) (transform16x16_1[3][6]))));
6330 E6h = _mm_add_epi32(E6h,
6331 _mm_madd_epi16(m128Tmp7,
6332 _mm_load_si128(
6333 (__m128i *) (transform16x16_1[3][6]))));
6334
6335 /* Compute E7 */
6336 E7l = _mm_madd_epi16(m128Tmp0,
6337 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6338 E7h = _mm_madd_epi16(m128Tmp1,
6339 _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
6340 E7l = _mm_add_epi32(E7l,
6341 _mm_madd_epi16(m128Tmp2,
6342 _mm_load_si128(
6343 (__m128i *) (transform16x16_1[1][7]))));
6344 E7h = _mm_add_epi32(E7h,
6345 _mm_madd_epi16(m128Tmp3,
6346 _mm_load_si128(
6347 (__m128i *) (transform16x16_1[1][7]))));
6348 E7l = _mm_add_epi32(E7l,
6349 _mm_madd_epi16(m128Tmp4,
6350 _mm_load_si128(
6351 (__m128i *) (transform16x16_1[2][7]))));
6352 E7h = _mm_add_epi32(E7h,
6353 _mm_madd_epi16(m128Tmp5,
6354 _mm_load_si128(
6355 (__m128i *) (transform16x16_1[2][7]))));
6356 E7l = _mm_add_epi32(E7l,
6357 _mm_madd_epi16(m128Tmp6,
6358 _mm_load_si128(
6359 (__m128i *) (transform16x16_1[3][7]))));
6360 E7h = _mm_add_epi32(E7h,
6361 _mm_madd_epi16(m128Tmp7,
6362 _mm_load_si128(
6363 (__m128i *) (transform16x16_1[3][7]))));
6364
6365 /* Compute E00 - E03 */
6366
6367 m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
6368 E00l = _mm_madd_epi16(m128Tmp0,
6369 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6370 m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
6371 E00h = _mm_madd_epi16(m128Tmp1,
6372 _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
6373
6374 m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
6375 E00l = _mm_add_epi32(E00l,
6376 _mm_madd_epi16(m128Tmp2,
6377 _mm_load_si128(
6378 (__m128i *) (transform16x16_2[1][0]))));
6379 m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
6380 E00h = _mm_add_epi32(E00h,
6381 _mm_madd_epi16(m128Tmp3,
6382 _mm_load_si128(
6383 (__m128i *) (transform16x16_2[1][0]))));
6384
6385 E01l = _mm_madd_epi16(m128Tmp0,
6386 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6387 E01h = _mm_madd_epi16(m128Tmp1,
6388 _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
6389 E01l = _mm_add_epi32(E01l,
6390 _mm_madd_epi16(m128Tmp2,
6391 _mm_load_si128(
6392 (__m128i *) (transform16x16_2[1][1]))));
6393 E01h = _mm_add_epi32(E01h,
6394 _mm_madd_epi16(m128Tmp3,
6395 _mm_load_si128(
6396 (__m128i *) (transform16x16_2[1][1]))));
6397
6398 E02l = _mm_madd_epi16(m128Tmp0,
6399 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6400 E02h = _mm_madd_epi16(m128Tmp1,
6401 _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
6402 E02l = _mm_add_epi32(E02l,
6403 _mm_madd_epi16(m128Tmp2,
6404 _mm_load_si128(
6405 (__m128i *) (transform16x16_2[1][2]))));
6406 E02h = _mm_add_epi32(E02h,
6407 _mm_madd_epi16(m128Tmp3,
6408 _mm_load_si128(
6409 (__m128i *) (transform16x16_2[1][2]))));
6410
6411 E03l = _mm_madd_epi16(m128Tmp0,
6412 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6413 E03h = _mm_madd_epi16(m128Tmp1,
6414 _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
6415 E03l = _mm_add_epi32(E03l,
6416 _mm_madd_epi16(m128Tmp2,
6417 _mm_load_si128(
6418 (__m128i *) (transform16x16_2[1][3]))));
6419 E03h = _mm_add_epi32(E03h,
6420 _mm_madd_epi16(m128Tmp3,
6421 _mm_load_si128(
6422 (__m128i *) (transform16x16_2[1][3]))));
6423
6424 /* Compute EE0 and EEE */
6425
6426 m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
6427 EE0l = _mm_madd_epi16(m128Tmp0,
6428 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6429 m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
6430 EE0h = _mm_madd_epi16(m128Tmp1,
6431 _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
6432
6433 m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
6434 EEE0l = _mm_madd_epi16(m128Tmp2,
6435 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6436 m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
6437 EEE0h = _mm_madd_epi16(m128Tmp3,
6438 _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
6439
6440 EE1l = _mm_madd_epi16(m128Tmp0,
6441 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6442 EE1h = _mm_madd_epi16(m128Tmp1,
6443 _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
6444
6445 EEE1l = _mm_madd_epi16(m128Tmp2,
6446 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6447 EEE1h = _mm_madd_epi16(m128Tmp3,
6448 _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
6449
6450 /* Compute EE */
6451
6452 EE2l = _mm_sub_epi32(EEE1l, EE1l);
6453 EE3l = _mm_sub_epi32(EEE0l, EE0l);
6454 EE2h = _mm_sub_epi32(EEE1h, EE1h);
6455 EE3h = _mm_sub_epi32(EEE0h, EE0h);
6456
6457 EE0l = _mm_add_epi32(EEE0l, EE0l);
6458 EE1l = _mm_add_epi32(EEE1l, EE1l);
6459 EE0h = _mm_add_epi32(EEE0h, EE0h);
6460 EE1h = _mm_add_epi32(EEE1h, EE1h);
6461 /**/
6462
6463 EE7l = _mm_sub_epi32(EE0l, E00l);
6464 EE6l = _mm_sub_epi32(EE1l, E01l);
6465 EE5l = _mm_sub_epi32(EE2l, E02l);
6466 EE4l = _mm_sub_epi32(EE3l, E03l);
6467
6468 EE7h = _mm_sub_epi32(EE0h, E00h);
6469 EE6h = _mm_sub_epi32(EE1h, E01h);
6470 EE5h = _mm_sub_epi32(EE2h, E02h);
6471 EE4h = _mm_sub_epi32(EE3h, E03h);
6472
6473 EE0l = _mm_add_epi32(EE0l, E00l);
6474 EE1l = _mm_add_epi32(EE1l, E01l);
6475 EE2l = _mm_add_epi32(EE2l, E02l);
6476 EE3l = _mm_add_epi32(EE3l, E03l);
6477
6478 EE0h = _mm_add_epi32(EE0h, E00h);
6479 EE1h = _mm_add_epi32(EE1h, E01h);
6480 EE2h = _mm_add_epi32(EE2h, E02h);
6481 EE3h = _mm_add_epi32(EE3h, E03h);
6482 /* Compute E */
6483
6484 E15l = _mm_sub_epi32(EE0l, E0l);
6485 E15l = _mm_add_epi32(E15l, m128iAdd);
6486 E14l = _mm_sub_epi32(EE1l, E1l);
6487 E14l = _mm_add_epi32(E14l, m128iAdd);
6488 E13l = _mm_sub_epi32(EE2l, E2l);
6489 E13l = _mm_add_epi32(E13l, m128iAdd);
6490 E12l = _mm_sub_epi32(EE3l, E3l);
6491 E12l = _mm_add_epi32(E12l, m128iAdd);
6492 E11l = _mm_sub_epi32(EE4l, E4l);
6493 E11l = _mm_add_epi32(E11l, m128iAdd);
6494 E10l = _mm_sub_epi32(EE5l, E5l);
6495 E10l = _mm_add_epi32(E10l, m128iAdd);
6496 E9l = _mm_sub_epi32(EE6l, E6l);
6497 E9l = _mm_add_epi32(E9l, m128iAdd);
6498 E8l = _mm_sub_epi32(EE7l, E7l);
6499 E8l = _mm_add_epi32(E8l, m128iAdd);
6500
6501 E0l = _mm_add_epi32(EE0l, E0l);
6502 E0l = _mm_add_epi32(E0l, m128iAdd);
6503 E1l = _mm_add_epi32(EE1l, E1l);
6504 E1l = _mm_add_epi32(E1l, m128iAdd);
6505 E2l = _mm_add_epi32(EE2l, E2l);
6506 E2l = _mm_add_epi32(E2l, m128iAdd);
6507 E3l = _mm_add_epi32(EE3l, E3l);
6508 E3l = _mm_add_epi32(E3l, m128iAdd);
6509 E4l = _mm_add_epi32(EE4l, E4l);
6510 E4l = _mm_add_epi32(E4l, m128iAdd);
6511 E5l = _mm_add_epi32(EE5l, E5l);
6512 E5l = _mm_add_epi32(E5l, m128iAdd);
6513 E6l = _mm_add_epi32(EE6l, E6l);
6514 E6l = _mm_add_epi32(E6l, m128iAdd);
6515 E7l = _mm_add_epi32(EE7l, E7l);
6516 E7l = _mm_add_epi32(E7l, m128iAdd);
6517
6518 E15h = _mm_sub_epi32(EE0h, E0h);
6519 E15h = _mm_add_epi32(E15h, m128iAdd);
6520 E14h = _mm_sub_epi32(EE1h, E1h);
6521 E14h = _mm_add_epi32(E14h, m128iAdd);
6522 E13h = _mm_sub_epi32(EE2h, E2h);
6523 E13h = _mm_add_epi32(E13h, m128iAdd);
6524 E12h = _mm_sub_epi32(EE3h, E3h);
6525 E12h = _mm_add_epi32(E12h, m128iAdd);
6526 E11h = _mm_sub_epi32(EE4h, E4h);
6527 E11h = _mm_add_epi32(E11h, m128iAdd);
6528 E10h = _mm_sub_epi32(EE5h, E5h);
6529 E10h = _mm_add_epi32(E10h, m128iAdd);
6530 E9h = _mm_sub_epi32(EE6h, E6h);
6531 E9h = _mm_add_epi32(E9h, m128iAdd);
6532 E8h = _mm_sub_epi32(EE7h, E7h);
6533 E8h = _mm_add_epi32(E8h, m128iAdd);
6534
6535 E0h = _mm_add_epi32(EE0h, E0h);
6536 E0h = _mm_add_epi32(E0h, m128iAdd);
6537 E1h = _mm_add_epi32(EE1h, E1h);
6538 E1h = _mm_add_epi32(E1h, m128iAdd);
6539 E2h = _mm_add_epi32(EE2h, E2h);
6540 E2h = _mm_add_epi32(E2h, m128iAdd);
6541 E3h = _mm_add_epi32(EE3h, E3h);
6542 E3h = _mm_add_epi32(E3h, m128iAdd);
6543 E4h = _mm_add_epi32(EE4h, E4h);
6544 E4h = _mm_add_epi32(E4h, m128iAdd);
6545 E5h = _mm_add_epi32(EE5h, E5h);
6546 E5h = _mm_add_epi32(E5h, m128iAdd);
6547 E6h = _mm_add_epi32(EE6h, E6h);
6548 E6h = _mm_add_epi32(E6h, m128iAdd);
6549 E7h = _mm_add_epi32(EE7h, E7h);
6550 E7h = _mm_add_epi32(E7h, m128iAdd);
6551
6552 m128iS0 = _mm_packs_epi32(
6553 _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
6554 _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
6555 m128iS1 = _mm_packs_epi32(
6556 _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
6557 _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
6558 m128iS2 = _mm_packs_epi32(
6559 _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
6560 _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
6561 m128iS3 = _mm_packs_epi32(
6562 _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
6563 _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
6564 m128iS4 = _mm_packs_epi32(
6565 _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
6566 _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
6567 m128iS5 = _mm_packs_epi32(
6568 _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
6569 _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
6570 m128iS6 = _mm_packs_epi32(
6571 _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
6572 _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
6573 m128iS7 = _mm_packs_epi32(
6574 _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
6575 _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
6576 m128iS8 = _mm_packs_epi32(
6577 _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
6578 _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
6579 m128iS9 = _mm_packs_epi32(
6580 _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
6581 _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
6582 m128iS10 = _mm_packs_epi32(
6583 _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
6584 _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
6585 m128iS11 = _mm_packs_epi32(
6586 _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
6587 _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
6588 m128iS12 = _mm_packs_epi32(
6589 _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
6590 _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
6591 m128iS13 = _mm_packs_epi32(
6592 _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
6593 _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
6594 m128iS14 = _mm_packs_epi32(
6595 _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
6596 _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
6597 m128iS15 = _mm_packs_epi32(
6598 _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
6599 _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
6600
6601 m128iS31 = _mm_packs_epi32(
6602 _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
6603 _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
6604 m128iS30 = _mm_packs_epi32(
6605 _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
6606 _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
6607 m128iS29 = _mm_packs_epi32(
6608 _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
6609 _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
6610 m128iS28 = _mm_packs_epi32(
6611 _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
6612 _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
6613 m128iS27 = _mm_packs_epi32(
6614 _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
6615 _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
6616 m128iS26 = _mm_packs_epi32(
6617 _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
6618 _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
6619 m128iS25 = _mm_packs_epi32(
6620 _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
6621 _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
6622 m128iS24 = _mm_packs_epi32(
6623 _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
6624 _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
6625 m128iS23 = _mm_packs_epi32(
6626 _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
6627 _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
6628 m128iS22 = _mm_packs_epi32(
6629 _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
6630 _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
6631 m128iS21 = _mm_packs_epi32(
6632 _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
6633 _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
6634 m128iS20 = _mm_packs_epi32(
6635 _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
6636 _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
6637 m128iS19 = _mm_packs_epi32(
6638 _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
6639 _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
6640 m128iS18 = _mm_packs_epi32(
6641 _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
6642 _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
6643 m128iS17 = _mm_packs_epi32(
6644 _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
6645 _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
6646 m128iS16 = _mm_packs_epi32(
6647 _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
6648 _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6649
6650 if (!j) {
6651 /* Inverse the matrix */
6652 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
6653 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
6654 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
6655 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
6656 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
6657 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
6658 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
6659 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
6660 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
6661 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
6662 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
6663 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
6664 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
6665 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
6666 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
6667 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
6668
6669 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
6670 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
6671 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
6672 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
6673 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
6674 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
6675 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
6676 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
6677 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
6678 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
6679 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
6680 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
6681 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
6682 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
6683 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
6684 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
6685
6686 E0h = _mm_unpacklo_epi16(E0l, E8l);
6687 E1h = _mm_unpacklo_epi16(E1l, E9l);
6688 E2h = _mm_unpacklo_epi16(E2l, E10l);
6689 E3h = _mm_unpacklo_epi16(E3l, E11l);
6690 E4h = _mm_unpacklo_epi16(E4l, E12l);
6691 E5h = _mm_unpacklo_epi16(E5l, E13l);
6692 E6h = _mm_unpacklo_epi16(E6l, E14l);
6693 E7h = _mm_unpacklo_epi16(E7l, E15l);
6694
6695 E8h = _mm_unpackhi_epi16(E0l, E8l);
6696 E9h = _mm_unpackhi_epi16(E1l, E9l);
6697 E10h = _mm_unpackhi_epi16(E2l, E10l);
6698 E11h = _mm_unpackhi_epi16(E3l, E11l);
6699 E12h = _mm_unpackhi_epi16(E4l, E12l);
6700 E13h = _mm_unpackhi_epi16(E5l, E13l);
6701 E14h = _mm_unpackhi_epi16(E6l, E14l);
6702 E15h = _mm_unpackhi_epi16(E7l, E15l);
6703
6704 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6705 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6706 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6707 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6708
6709 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6710 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6711 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6712 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6713
6714 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6715 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6716 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6717 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6718
6719 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6720 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6721 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6722 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6723
6724 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6725 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6726 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6727 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6728
6729 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6730 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6731 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6732 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6733
6734 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6735 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6736 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6737 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6738
6739 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6740 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6741 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6742 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6743
6744 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6745 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6746 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6747 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6748
6749 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6750 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6751 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6752 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6753
6754 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6755 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6756 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6757 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6758
6759 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6760 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6761 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6762 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6763
6764 /* */
6765 E0h = _mm_unpacklo_epi16(O0l, O8l);
6766 E1h = _mm_unpacklo_epi16(O1l, O9l);
6767 E2h = _mm_unpacklo_epi16(O2l, O10l);
6768 E3h = _mm_unpacklo_epi16(O3l, O11l);
6769 E4h = _mm_unpacklo_epi16(O4l, O12l);
6770 E5h = _mm_unpacklo_epi16(O5l, O13l);
6771 E6h = _mm_unpacklo_epi16(O6l, O14l);
6772 E7h = _mm_unpacklo_epi16(O7l, O15l);
6773
6774 E8h = _mm_unpackhi_epi16(O0l, O8l);
6775 E9h = _mm_unpackhi_epi16(O1l, O9l);
6776 E10h = _mm_unpackhi_epi16(O2l, O10l);
6777 E11h = _mm_unpackhi_epi16(O3l, O11l);
6778 E12h = _mm_unpackhi_epi16(O4l, O12l);
6779 E13h = _mm_unpackhi_epi16(O5l, O13l);
6780 E14h = _mm_unpackhi_epi16(O6l, O14l);
6781 E15h = _mm_unpackhi_epi16(O7l, O15l);
6782
6783 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
6784 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
6785 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
6786 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
6787
6788 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6789 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6790 m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6791 m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6792
6793 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6794 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6795 m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6796 m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6797
6798 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
6799 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
6800 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
6801 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
6802
6803 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6804 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6805 m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6806 m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6807
6808 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6809 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6810 m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6811 m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6812
6813 m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
6814 m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
6815 m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
6816 m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);
6817
6818 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6819 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6820 m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6821 m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6822
6823 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6824 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6825 m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6826 m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6827
6828 m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
6829 m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
6830 m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
6831 m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);
6832
6833 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
6834 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
6835 m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6836 m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6837
6838 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
6839 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
6840 m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
6841 m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
6842 /* */
6843 _mm_store_si128((__m128i *) (src + i), m128iS0);
6844 _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
6845 _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
6846 _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
6847 _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
6848 _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
6849 _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
6850 _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
6851 _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
6852 _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
6853 _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
6854 _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
6855 _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
6856 _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
6857 _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
6858 _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
6859 _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
6860 _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
6861 _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
6862 _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
6863 _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
6864 _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
6865 _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
6866 _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
6867 _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
6868 _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
6869 _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
6870 _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
6871 _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
6872 _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
6873 _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
6874 _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);
6875
6876 if (i <= 16) {
6877 int k = i + 8;
6878 m128iS0 = _mm_load_si128((__m128i *) (src + k));
6879 m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
6880 m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
6881 m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
6882 m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
6883 m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
6884 m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
6885 m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
6886 m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
6887 m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
6888 m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
6889 m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
6890 m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
6891 m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
6892 m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
6893 m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));
6894
6895 m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
6896 m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6897 m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6898 m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6899 m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6900 m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6901 m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6902 m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6903 m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6904 m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6905 m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6906 m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6907 m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6908 m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6909 m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6910 m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6911 } else {
6912 m128iS0 = _mm_load_si128((__m128i *) (src));
6913 m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6914 m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6915 m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6916 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6917 m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6918 m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6919 m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6920 m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6921 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6922 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6923 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6924 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6925 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6926 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6927 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6928 m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6929 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6930 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6931 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6932 m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6933 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6934 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6935 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6936 m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6937 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6938 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6939 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6940 m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6941 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6942 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6943 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6944 shift = shift_2nd;
6945 m128iAdd = _mm_set1_epi32(add_2nd);
6946 }
6947
6948 } else {
6949 int k, m = 0;
6950 _mm_storeu_si128((__m128i *) (src), m128iS0);
6951 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6952 _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6953 _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6954 _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6955 _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6956 _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6957 _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6958 _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6959 _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6960 _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6961 _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6962 _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6963 _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6964 _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6965 _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6966
6967 _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6968 _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6969 _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6970 _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6971 _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6972 _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6973 _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6974 _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6975 _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6976 _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6977 _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6978 _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6979 _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6980 _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6981 _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6982 _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6983 dst = (uint16_t*) _dst + (i * stride);
6984 for (k = 0; k < 8; k++) {
6985 dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6986 dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6987 dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6988 dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6989 dst[4] = av_clip_uintp2(
6990 dst[4] + src[m + 128],10);
6991 dst[5] = av_clip_uintp2(
6992 dst[5] + src[m + 128 + 8],10);
6993 dst[6] = av_clip_uintp2(
6994 dst[6] + src[m + 128 + 16],10);
6995 dst[7] = av_clip_uintp2(
6996 dst[7] + src[m + 128 + 24],10);
6997
6998 dst[8] = av_clip_uintp2(
6999 dst[8] + src[m + 256],10);
7000 dst[9] = av_clip_uintp2(
7001 dst[9] + src[m + 256 + 8],10);
7002 dst[10] = av_clip_uintp2(
7003 dst[10] + src[m + 256 + 16],10);
7004 dst[11] = av_clip_uintp2(
7005 dst[11] + src[m + 256 + 24],10);
7006 dst[12] = av_clip_uintp2(
7007 dst[12] + src[m + 384],10);
7008 dst[13] = av_clip_uintp2(
7009 dst[13] + src[m + 384 + 8],10);
7010 dst[14] = av_clip_uintp2(
7011 dst[14] + src[m + 384 + 16],10);
7012 dst[15] = av_clip_uintp2(
7013 dst[15] + src[m + 384 + 24],10);
7014
7015 dst[16] = av_clip_uintp2(
7016 dst[16] + src[m + 512],10);
7017 dst[17] = av_clip_uintp2(
7018 dst[17] + src[m + 512 + 8],10);
7019 dst[18] = av_clip_uintp2(
7020 dst[18] + src[m + 512 + 16],10);
7021 dst[19] = av_clip_uintp2(
7022 dst[19] + src[m + 512 + 24],10);
7023 dst[20] = av_clip_uintp2(
7024 dst[20] + src[m + 640],10);
7025 dst[21] = av_clip_uintp2(
7026 dst[21] + src[m + 640 + 8],10);
7027 dst[22] = av_clip_uintp2(
7028 dst[22] + src[m + 640 + 16],10);
7029 dst[23] = av_clip_uintp2(
7030 dst[23] + src[m + 640 + 24],10);
7031
7032 dst[24] = av_clip_uintp2(
7033 dst[24] + src[m + 768],10);
7034 dst[25] = av_clip_uintp2(
7035 dst[25] + src[m + 768 + 8],10);
7036 dst[26] = av_clip_uintp2(
7037 dst[26] + src[m + 768 + 16],10);
7038 dst[27] = av_clip_uintp2(
7039 dst[27] + src[m + 768 + 24],10);
7040 dst[28] = av_clip_uintp2(
7041 dst[28] + src[m + 896],10);
7042 dst[29] = av_clip_uintp2(
7043 dst[29] + src[m + 896 + 8],10);
7044 dst[30] = av_clip_uintp2(
7045 dst[30] + src[m + 896 + 16],10);
7046 dst[31] = av_clip_uintp2(
7047 dst[31] + src[m + 896 + 24],10);
7048
7049 m += 1;
7050 dst += stride;
7051 }
7052 if (i <= 16) {
7053 int k = (i + 8) * 4;
7054 m128iS0 = _mm_load_si128((__m128i *) (src + k));
7055 m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7056 m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7057 m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7058 m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7059 m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7060 m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7061 m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7062 m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7063 m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7064 m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7065 m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7066 m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7067 m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7068 m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7069 m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7070 m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7071 m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7072 m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7073 m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7074 m128iS20 = _mm_loadu_si128(
7075 (__m128i *) (src + 512 + 16 + k));
7076 m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7077 m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7078 m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7079 m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7080 m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7081 m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7082 m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7083 m128iS28 = _mm_loadu_si128(
7084 (__m128i *) (src + 512 + 24 + k));
7085 m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7086 m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7087 m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7088 }
7089 }
7090 }
7091 }
7092 }
7093 #endif
7094
7095