/*
 * H.265 video codec.
 * Copyright (c) 2013 openHEVC contributors
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */
21 
#include "x86/sse-dct.h"
#include "libde265/util.h"

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>    // memcpy (alias-safe 4-byte stores)

#include <emmintrin.h> // SSE2
#include <tmmintrin.h> // SSSE3

#if HAVE_SSE4_1
#include <smmintrin.h> // SSE4.1
#endif
35 
36 
37 ALIGNED_16(static const int16_t) transform4x4_luma[8][8] =
38 {
39     {   29, +84, 29,  +84,  29, +84,  29, +84 },
40     {  +74, +55, +74, +55, +74, +55, +74, +55 },
41     {   55, -29,  55, -29,  55, -29,  55, -29 },
42     {  +74, -84, +74, -84, +74, -84, +74, -84 },
43     {   74, -74,  74, -74,  74, -74,  74, -74 },
44     {    0, +74,   0, +74,   0, +74,   0, +74 },
45     {   84, +55,  84, +55,  84, +55,  84, +55 },
46     {  -74, -29, -74, -29, -74, -29, -74, -29 }
47 };
48 
49 ALIGNED_16(static const int16_t) transform4x4[4][8] = {
50     { 64,  64, 64,  64, 64,  64, 64,  64 },
51     { 64, -64, 64, -64, 64, -64, 64, -64 },
52     { 83,  36, 83,  36, 83,  36, 83,  36 },
53     { 36, -83, 36, -83, 36, -83, 36, -83 }
54 };
55 
56 ALIGNED_16(static const int16_t) transform8x8[12][8] =
57 {
58     {  89,  75,  89,  75, 89,  75, 89,  75 },
59     {  50,  18,  50,  18, 50,  18, 50,  18 },
60     {  75, -18,  75, -18, 75, -18, 75, -18 },
61     { -89, -50, -89, -50,-89, -50,-89, -50 },
62     {  50, -89,  50, -89, 50, -89, 50, -89 },
63     {  18,  75,  18,  75, 18,  75, 18,  75 },
64     {  18, -50,  18, -50, 18, -50, 18, -50 },
65     {  75, -89,  75, -89, 75, -89, 75, -89 },
66     {  64,  64,  64,  64, 64,  64, 64,  64 },
67     {  64, -64,  64, -64, 64, -64, 64, -64 },
68     {  83,  36,  83,  36, 83,  36, 83,  36 },
69     {  36, -83,  36, -83, 36, -83, 36, -83 }
70 };
71 
72 ALIGNED_16(static const int16_t) transform16x16_1[4][8][8] =
73 {
74     {/*1-3*/ /*2-6*/
75         { 90,  87,  90,  87,  90,  87,  90,  87 },
76         { 87,  57,  87,  57,  87,  57,  87,  57 },
77         { 80,   9,  80,   9,  80,   9,  80,   9 },
78         { 70, -43,  70, -43,  70, -43,  70, -43 },
79         { 57, -80,  57, -80,  57, -80,  57, -80 },
80         { 43, -90,  43, -90,  43, -90,  43, -90 },
81         { 25, -70,  25, -70,  25, -70,  25, -70 },
82         { 9,  -25,   9, -25,   9, -25,   9, -25 },
83     },{ /*5-7*/ /*10-14*/
84         {  80,  70,  80,  70,  80,  70,  80,  70 },
85         {   9, -43,   9, -43,   9, -43,   9, -43 },
86         { -70, -87, -70, -87, -70, -87, -70, -87 },
87         { -87,   9, -87,   9, -87,   9, -87,   9 },
88         { -25,  90, -25,  90, -25,  90, -25,  90 },
89         {  57,  25,  57,  25,  57,  25,  57,  25 },
90         {  90, -80,  90, -80,  90, -80,  90, -80 },
91         {  43, -57,  43, -57,  43, -57,  43, -57 },
92     },{ /*9-11*/ /*18-22*/
93         {  57,  43,  57,  43,  57,  43,  57,  43 },
94         { -80, -90, -80, -90, -80, -90, -80, -90 },
95         { -25,  57, -25,  57, -25,  57, -25,  57 },
96         {  90,  25,  90,  25,  90,  25,  90,  25 },
97         {  -9,  -87, -9,  -87, -9,  -87, -9, -87 },
98         { -87,  70, -87,  70, -87,  70, -87,  70 },
99         {  43,   9,  43,   9,  43,   9,  43,   9 },
100         {  70, -80,  70, -80,  70, -80,  70, -80 },
101     },{/*13-15*/ /*  26-30   */
102         {  25,   9,  25,   9,  25,   9,  25,   9 },
103         { -70, -25, -70, -25, -70, -25, -70, -25 },
104         {  90,  43,  90,  43,  90,  43,  90,  43 },
105         { -80, -57, -80, -57, -80, -57, -80, -57 },
106         {  43,  70,  43,  70,  43,  70,  43,  70 },
107         {  9,  -80,   9, -80,   9, -80,   9, -80 },
108         { -57,  87, -57,  87, -57,  87, -57,  87 },
109         {  87, -90,  87, -90,  87, -90,  87, -90 },
110     }
111 };
112 
113 ALIGNED_16(static const int16_t) transform16x16_2[2][4][8] =
114 {
115     { /*2-6*/ /*4-12*/
116         { 89,  75,  89,  75, 89,  75, 89,  75 },
117         { 75, -18,  75, -18, 75, -18, 75, -18 },
118         { 50, -89,  50, -89, 50, -89, 50, -89 },
119         { 18, -50,  18, -50, 18, -50, 18, -50 },
120     },{ /*10-14*/  /*20-28*/
121         {  50,  18,  50,  18,  50,  18,  50,  18 },
122         { -89, -50, -89, -50, -89, -50, -89, -50 },
123         {  18,  75,  18,  75,  18,  75,  18,  75 },
124         {  75, -89,  75, -89,  75, -89,  75, -89 },
125     }
126 };
127 
128 ALIGNED_16(static const int16_t) transform16x16_3[2][2][8] =
129 {
130     {/*4-12*/ /*8-24*/
131         {  83,  36,  83,  36,  83,  36,  83,  36 },
132         {  36, -83,  36, -83,  36, -83,  36, -83 },
133     },{ /*0-8*/  /*0-16*/
134         { 64,  64, 64,  64, 64,  64, 64,  64 },
135         { 64, -64, 64, -64, 64, -64, 64, -64 },
136     }
137 };
138 
139 
140 ALIGNED_16(static const int16_t) transform32x32[8][16][8] =
141 {
142     { /*   1-3     */
143         { 90,  90, 90,  90, 90,  90, 90,  90 },
144         { 90,  82, 90,  82, 90,  82, 90,  82 },
145         { 88,  67, 88,  67, 88,  67, 88,  67 },
146         { 85,  46, 85,  46, 85,  46, 85,  46 },
147         { 82,  22, 82,  22, 82,  22, 82,  22 },
148         { 78,  -4, 78,  -4, 78,  -4, 78,  -4 },
149         { 73, -31, 73, -31, 73, -31, 73, -31 },
150         { 67, -54, 67, -54, 67, -54, 67, -54 },
151         { 61, -73, 61, -73, 61, -73, 61, -73 },
152         { 54, -85, 54, -85, 54, -85, 54, -85 },
153         { 46, -90, 46, -90, 46, -90, 46, -90 },
154         { 38, -88, 38, -88, 38, -88, 38, -88 },
155         { 31, -78, 31, -78, 31, -78, 31, -78 },
156         { 22, -61, 22, -61, 22, -61, 22, -61 },
157         { 13, -38, 13, -38, 13, -38, 13, -38 },
158         { 4,  -13,  4, -13,  4, -13,  4, -13 },
159     },{/*  5-7 */
160         {  88,  85,  88,  85,  88,  85,  88,  85 },
161         {  67,  46,  67,  46,  67,  46,  67,  46 },
162         {  31, -13,  31, -13,  31, -13,  31, -13 },
163         { -13, -67, -13, -67, -13, -67, -13, -67 },
164         { -54, -90, -54, -90, -54, -90, -54, -90 },
165         { -82, -73, -82, -73, -82, -73, -82, -73 },
166         { -90, -22, -90, -22, -90, -22, -90, -22 },
167         { -78,  38, -78,  38, -78,  38, -78,  38 },
168         { -46,  82, -46,  82, -46,  82, -46,  82 },
169         {  -4,  88,  -4,  88,  -4,  88,  -4,  88 },
170         {  38,  54,  38,  54,  38,  54,  38,  54 },
171         {  73,  -4,  73,  -4,  73,  -4,  73,  -4 },
172         {  90, -61,  90, -61,  90, -61,  90, -61 },
173         {  85, -90,  85, -90,  85, -90,  85, -90 },
174         {  61, -78,  61, -78,  61, -78,  61, -78 },
175         {  22, -31,  22, -31,  22, -31,  22, -31 },
176     },{/*  9-11   */
177         {  82,  78,  82,  78,  82,  78,  82,  78 },
178         {  22,  -4,  22,  -4,  22,  -4,  22,  -4 },
179         { -54, -82, -54, -82, -54, -82, -54, -82 },
180         { -90, -73, -90, -73, -90, -73, -90, -73 },
181         { -61,  13, -61,  13, -61,  13, -61,  13 },
182         {  13,  85,  13,  85,  13,  85,  13,  85 },
183         {  78,  67,  78,  67,  78,  67,  78,  67 },
184         {  85, -22,  85, -22,  85, -22,  85, -22 },
185         {  31, -88,  31, -88,  31, -88,  31, -88 },
186         { -46, -61, -46, -61, -46, -61, -46, -61 },
187         { -90,  31, -90,  31, -90,  31, -90,  31 },
188         { -67,  90, -67,  90, -67,  90, -67,  90 },
189         {   4,  54,   4,  54,   4,  54,   4,  54 },
190         {  73, -38,  73, -38,  73, -38,  73, -38 },
191         {  88, -90,  88, -90,  88, -90,  88, -90 },
192         {  38, -46,  38, -46,  38, -46,  38, -46 },
193     },{/*  13-15   */
194         {  73,  67,  73,  67,  73,  67,  73,  67 },
195         { -31, -54, -31, -54, -31, -54, -31, -54 },
196         { -90, -78, -90, -78, -90, -78, -90, -78 },
197         { -22,  38, -22,  38, -22,  38, -22,  38 },
198         {  78,  85,  78,  85,  78,  85,  78,  85 },
199         {  67, -22,  67, -22,  67, -22,  67, -22 },
200         { -38, -90, -38, -90, -38, -90, -38, -90 },
201         { -90,   4, -90,   4, -90,   4, -90,   4 },
202         { -13,  90, -13,  90, -13,  90, -13,  90 },
203         {  82,  13,  82,  13,  82,  13,  82,  13 },
204         {  61, -88,  61, -88,  61, -88,  61, -88 },
205         { -46, -31, -46, -31, -46, -31, -46, -31 },
206         { -88,  82, -88,  82, -88,  82, -88,  82 },
207         { -4,   46, -4,   46, -4,   46, -4,   46 },
208         {  85, -73,  85, -73,  85, -73,  85, -73 },
209         {  54, -61,  54, -61,  54, -61,  54, -61 },
210     },{/*  17-19   */
211         {  61,  54,  61,  54,  61,  54,  61,  54 },
212         { -73, -85, -73, -85, -73, -85, -73, -85 },
213         { -46,  -4, -46,  -4, -46,  -4, -46,  -4 },
214         {  82,  88,  82,  88,  82,  88,  82,  88 },
215         {  31, -46,  31, -46,  31, -46,  31, -46 },
216         { -88, -61, -88, -61, -88, -61, -88, -61 },
217         { -13,  82, -13,  82, -13,  82, -13,  82 },
218         {  90,  13,  90,  13,  90,  13,  90,  13 },
219         { -4, -90,  -4, -90,  -4, -90,  -4, -90 },
220         { -90,  38, -90,  38, -90,  38, -90,  38 },
221         {  22,  67,  22,  67,  22,  67,  22,  67 },
222         {  85, -78,  85, -78,  85, -78,  85, -78 },
223         { -38, -22, -38, -22, -38, -22, -38, -22 },
224         { -78,  90, -78,  90, -78,  90, -78,  90 },
225         {  54, -31,  54, -31,  54, -31,  54, -31 },
226         {  67, -73,  67, -73,  67, -73,  67, -73 },
227     },{ /*  21-23   */
228         {  46,  38,  46,  38,  46,  38,  46,  38 },
229         { -90, -88, -90, -88, -90, -88, -90, -88 },
230         {  38,  73,  38,  73,  38,  73,  38,  73 },
231         {  54,  -4,  54,  -4,  54,  -4,  54,  -4 },
232         { -90, -67, -90, -67, -90, -67, -90, -67 },
233         {  31,  90,  31,  90,  31,  90,  31,  90 },
234         {  61, -46,  61, -46,  61, -46,  61, -46 },
235         { -88, -31, -88, -31, -88, -31, -88, -31 },
236         {  22,  85,  22,  85,  22,  85,  22,  85 },
237         {  67, -78,  67, -78,  67, -78,  67, -78 },
238         { -85,  13, -85,  13, -85,  13, -85,  13 },
239         {  13,  61,  13,  61,  13,  61,  13,  61 },
240         {  73, -90,  73, -90,  73, -90,  73, -90 },
241         { -82,  54, -82,  54, -82,  54, -82,  54 },
242         {   4,  22,   4,  22,   4,  22,   4,  22 },
243         {  78, -82,  78, -82,  78, -82,  78, -82 },
244     },{ /*  25-27   */
245         {  31,  22,  31,  22,  31,  22,  31,  22 },
246         { -78, -61, -78, -61, -78, -61, -78, -61 },
247         {  90,  85,  90,  85,  90,  85,  90,  85 },
248         { -61, -90, -61, -90, -61, -90, -61, -90 },
249         {   4,  73,   4,  73,   4,  73,   4,  73 },
250         {  54, -38,  54, -38,  54, -38,  54, -38 },
251         { -88,  -4, -88,  -4, -88,  -4, -88,  -4 },
252         {  82,  46,  82,  46,  82,  46,  82,  46 },
253         { -38, -78, -38, -78, -38, -78, -38, -78 },
254         { -22,  90, -22,  90, -22,  90, -22,  90 },
255         {  73, -82,  73, -82,  73, -82,  73, -82 },
256         { -90,  54, -90,  54, -90,  54, -90,  54 },
257         {  67, -13,  67, -13,  67, -13,  67, -13 },
258         { -13, -31, -13, -31, -13, -31, -13, -31 },
259         { -46,  67, -46,  67, -46,  67, -46,  67 },
260         {  85, -88,  85, -88,  85, -88,  85, -88 },
261     },{/*  29-31   */
262         {  13,   4,  13,   4,  13,   4,  13,   4 },
263         { -38, -13, -38, -13, -38, -13, -38, -13 },
264         {  61,  22,  61,  22,  61,  22,  61,  22 },
265         { -78, -31, -78, -31, -78, -31, -78, -31 },
266         {  88,  38,  88,  38,  88,  38,  88,  38 },
267         { -90, -46, -90, -46, -90, -46, -90, -46 },
268         {  85,  54,  85,  54,  85,  54,  85,  54 },
269         { -73, -61, -73, -61, -73, -61, -73, -61 },
270         {  54,  67,  54,  67,  54,  67,  54,  67 },
271         { -31, -73, -31, -73, -31, -73, -31, -73 },
272         {   4,  78,   4,  78,   4,  78,   4,  78 },
273         {  22, -82,  22, -82,  22, -82,  22, -82 },
274         { -46,  85, -46,  85, -46,  85, -46,  85 },
275         {  67, -88,  67, -88,  67, -88,  67, -88 },
276         { -82,  90, -82,  90, -82,  90, -82,  90 },
277         {  90, -90,  90, -90,  90, -90,  90, -90 },
278     }
279 };
280 
281 #define shift_1st 7
282 #define add_1st (1 << (shift_1st - 1))
283 
284 
ff_hevc_transform_skip_8_sse(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)285 void ff_hevc_transform_skip_8_sse(uint8_t *_dst, const int16_t *coeffs, ptrdiff_t _stride)
286 {
287     uint8_t *dst = (uint8_t*)_dst;
288     ptrdiff_t stride = _stride;
289     int shift = 5;
290     int offset = 16;
291     __m128i r0,r1,r2,r3,r4,r5,r6,r9;
292 
293     r9= _mm_setzero_si128();
294     //r8= _mm_set_epi32(0,0,0,-1);
295     r2= _mm_set1_epi16(offset);
296 
297     r0= _mm_load_si128((__m128i*)(coeffs));
298     r1= _mm_load_si128((__m128i*)(coeffs+8));
299 
300 
301     r0= _mm_adds_epi16(r0,r2);
302     r1= _mm_adds_epi16(r1,r2);
303 
304     r0= _mm_srai_epi16(r0,shift);
305     r1= _mm_srai_epi16(r1,shift);
306 
307     r3= _mm_loadl_epi64((__m128i*)(dst));
308     r4= _mm_loadl_epi64((__m128i*)(dst + stride));
309     r5= _mm_loadl_epi64((__m128i*)(dst + 2*stride));
310     r6= _mm_loadl_epi64((__m128i*)(dst + 3*stride));
311 
312     r3= _mm_unpacklo_epi8(r3,r9);
313     r4= _mm_unpacklo_epi8(r4,r9);
314     r5= _mm_unpacklo_epi8(r5,r9);
315     r6= _mm_unpacklo_epi8(r6,r9);
316     r3= _mm_unpacklo_epi64(r3,r4);
317     r4= _mm_unpacklo_epi64(r5,r6);
318 
319 
320     r3= _mm_adds_epi16(r3,r0);
321     r4= _mm_adds_epi16(r4,r1);
322 
323     r3= _mm_packus_epi16(r3,r4);
324     //r8= _mm_set_epi32(0,0,0,-1);
325 
326     //_mm_maskmoveu_si128(r3,r8,(char *) (dst));
327     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(r3);
328 
329     r3= _mm_srli_si128(r3,4);
330     //_mm_maskmoveu_si128(r3,r8,(char *) (dst+stride));
331     *((uint32_t*)(dst+stride)) = _mm_cvtsi128_si32(r3);
332 
333     r3= _mm_srli_si128(r3,4);
334     //_mm_maskmoveu_si128(r3,r8,(char *) (dst+2*stride));
335     *((uint32_t*)(dst+2*stride)) = _mm_cvtsi128_si32(r3);
336 
337     r3= _mm_srli_si128(r3,4);
338     //_mm_maskmoveu_si128(r3,r8,(char *) (dst+3*stride));
339     *((uint32_t*)(dst+3*stride)) = _mm_cvtsi128_si32(r3);
340 }
341 
342 
343 
344 #if HAVE_SSE4_1
ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)345 void ff_hevc_transform_4x4_luma_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
346                                            ptrdiff_t _stride) {
347 
348     uint8_t shift_2nd = 12; // 20 - Bit depth
349     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
350 
351     uint8_t *dst = (uint8_t*) _dst;
352     ptrdiff_t stride = _stride;
353     const int16_t *src = coeffs;
354     __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
355             m128iD;
356     m128iAdd = _mm_set1_epi32(64);
357 
358     S0 = _mm_load_si128((__m128i *) (src));
359     S8 = _mm_load_si128((__m128i *) (src + 8));
360 
361     m128iAC = _mm_unpacklo_epi16(S0, S8);
362     m128iBD = _mm_unpackhi_epi16(S0, S8);
363 
364     m128iTmp1 = _mm_madd_epi16(m128iAC,
365             _mm_load_si128((__m128i *) (transform4x4_luma[0])));
366     m128iTmp2 = _mm_madd_epi16(m128iBD,
367             _mm_load_si128((__m128i *) (transform4x4_luma[1])));
368     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
369     S0 = _mm_add_epi32(S0, m128iAdd);
370     S0 = _mm_srai_epi32(S0, shift_1st);
371 
372     m128iTmp1 = _mm_madd_epi16(m128iAC,
373             _mm_load_si128((__m128i *) (transform4x4_luma[2])));
374     m128iTmp2 = _mm_madd_epi16(m128iBD,
375             _mm_load_si128((__m128i *) (transform4x4_luma[3])));
376     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
377     S8 = _mm_add_epi32(S8, m128iAdd);
378     S8 = _mm_srai_epi32(S8, shift_1st);
379 
380     m128iA = _mm_packs_epi32(S0, S8);
381 
382     m128iTmp1 = _mm_madd_epi16(m128iAC,
383             _mm_load_si128((__m128i *) (transform4x4_luma[4])));
384     m128iTmp2 = _mm_madd_epi16(m128iBD,
385             _mm_load_si128((__m128i *) (transform4x4_luma[5])));
386     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
387     S0 = _mm_add_epi32(S0, m128iAdd);
388     S0 = _mm_srai_epi32(S0, shift_1st);
389 
390     m128iTmp1 = _mm_madd_epi16(m128iAC,
391             _mm_load_si128((__m128i *) (transform4x4_luma[6])));
392     m128iTmp2 = _mm_madd_epi16(m128iBD,
393             _mm_load_si128((__m128i *) (transform4x4_luma[7])));
394     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
395     S8 = _mm_add_epi32(S8, m128iAdd);
396     S8 = _mm_srai_epi32(S8, shift_1st);
397 
398     m128iD = _mm_packs_epi32(S0, S8);
399 
400     S0 = _mm_unpacklo_epi16(m128iA, m128iD);
401     S8 = _mm_unpackhi_epi16(m128iA, m128iD);
402 
403     m128iA = _mm_unpacklo_epi16(S0, S8);
404     m128iD = _mm_unpackhi_epi16(S0, S8);
405 
406     /*   ###################    */
407     m128iAdd = _mm_set1_epi32(add_2nd);
408 
409     m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
410     m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
411 
412     m128iTmp1 = _mm_madd_epi16(m128iAC,
413             _mm_load_si128((__m128i *) (transform4x4_luma[0])));
414     m128iTmp2 = _mm_madd_epi16(m128iBD,
415             _mm_load_si128((__m128i *) (transform4x4_luma[1])));
416     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
417     S0 = _mm_add_epi32(S0, m128iAdd);
418     S0 = _mm_srai_epi32(S0, shift_2nd);
419 
420     m128iTmp1 = _mm_madd_epi16(m128iAC,
421             _mm_load_si128((__m128i *) (transform4x4_luma[2])));
422     m128iTmp2 = _mm_madd_epi16(m128iBD,
423             _mm_load_si128((__m128i *) (transform4x4_luma[3])));
424     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
425     S8 = _mm_add_epi32(S8, m128iAdd);
426     S8 = _mm_srai_epi32(S8, shift_2nd);
427 
428     m128iA = _mm_packs_epi32(S0, S8);
429 
430     m128iTmp1 = _mm_madd_epi16(m128iAC,
431             _mm_load_si128((__m128i *) (transform4x4_luma[4])));
432     m128iTmp2 = _mm_madd_epi16(m128iBD,
433             _mm_load_si128((__m128i *) (transform4x4_luma[5])));
434     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
435     S0 = _mm_add_epi32(S0, m128iAdd);
436     S0 = _mm_srai_epi32(S0, shift_2nd);
437 
438     m128iTmp1 = _mm_madd_epi16(m128iAC,
439             _mm_load_si128((__m128i *) (transform4x4_luma[6])));
440     m128iTmp2 = _mm_madd_epi16(m128iBD,
441             _mm_load_si128((__m128i *) (transform4x4_luma[7])));
442     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
443     S8 = _mm_add_epi32(S8, m128iAdd);
444     S8 = _mm_srai_epi32(S8, shift_2nd);
445 
446     m128iD = _mm_packs_epi32(S0, S8);
447 
448 //    _mm_storeu_si128((__m128i *) (src), m128iA);
449 //    _mm_storeu_si128((__m128i *) (src + 8), m128iD);
450 
451     S0 = _mm_move_epi64(m128iA); //contains row 0
452     S8 = _mm_move_epi64(m128iD); //row 2
453     m128iA = _mm_srli_si128(m128iA, 8); // row 1
454     m128iD = _mm_srli_si128(m128iD, 8); // row 3
455     m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
456     m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
457     S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
458     S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
459 
460     //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
461 
462     m128iA = _mm_loadl_epi64((__m128i *) dst);
463     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
464     m128iTmp1 = _mm_adds_epi16(S0, m128iA);	//contains first 4 values
465     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
466     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
467     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
468 
469     dst += stride;
470 
471     m128iA = _mm_loadl_epi64((__m128i *) dst);
472     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
473     m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
474     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
475     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
476     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
477 
478     dst += stride;
479 
480     m128iA = _mm_loadl_epi64((__m128i *) dst);
481     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
482     m128iTmp1 = _mm_adds_epi16(S8, m128iA);
483     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
484     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
485     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
486 
487     dst += stride;
488 
489     m128iA = _mm_loadl_epi64((__m128i *) dst);
490     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
491     m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
492     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
493     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
494     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
495 }
496 #endif // SSE4.1
497 
498 #if 0
499 void ff_hevc_transform_4x4_luma_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
500         ptrdiff_t _stride) {
501     int i,j;
502     uint8_t shift_2nd = 10; // 20 - Bit depth
503     uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
504 
505     uint16_t *dst = (uint16_t*) _dst;
506     ptrdiff_t stride = _stride/(sizeof(uint16_t));
507     int16_t *src = coeffs;
508     __m128i m128iAdd, S0, S8, m128iTmp1, m128iTmp2, m128iAC, m128iBD, m128iA,
509             m128iD;
510 
511     m128iAdd = _mm_set1_epi32(64);
512 
513     S0 = _mm_loadu_si128((__m128i *) (src));
514     S8 = _mm_loadu_si128((__m128i *) (src + 8));
515 
516     m128iAC = _mm_unpacklo_epi16(S0, S8);
517     m128iBD = _mm_unpackhi_epi16(S0, S8);
518 
519     m128iTmp1 = _mm_madd_epi16(m128iAC,
520             _mm_loadu_si128((__m128i *) (transform4x4_luma[0])));
521     m128iTmp2 = _mm_madd_epi16(m128iBD,
522             _mm_loadu_si128((__m128i *) (transform4x4_luma[1])));
523     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
524     S0 = _mm_add_epi32(S0, m128iAdd);
525     S0 = _mm_srai_epi32(S0, shift_1st);
526 
527     m128iTmp1 = _mm_madd_epi16(m128iAC,
528             _mm_loadu_si128((__m128i *) (transform4x4_luma[2])));
529     m128iTmp2 = _mm_madd_epi16(m128iBD,
530             _mm_loadu_si128((__m128i *) (transform4x4_luma[3])));
531     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
532     S8 = _mm_add_epi32(S8, m128iAdd);
533     S8 = _mm_srai_epi32(S8, shift_1st);
534 
535     m128iA = _mm_packs_epi32(S0, S8);
536 
537     m128iTmp1 = _mm_madd_epi16(m128iAC,
538             _mm_loadu_si128((__m128i *) (transform4x4_luma[4])));
539     m128iTmp2 = _mm_madd_epi16(m128iBD,
540             _mm_loadu_si128((__m128i *) (transform4x4_luma[5])));
541     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
542     S0 = _mm_add_epi32(S0, m128iAdd);
543     S0 = _mm_srai_epi32(S0, shift_1st);
544 
545     m128iTmp1 = _mm_madd_epi16(m128iAC,
546             _mm_loadu_si128((__m128i *) (transform4x4_luma[6])));
547     m128iTmp2 = _mm_madd_epi16(m128iBD,
548             _mm_loadu_si128((__m128i *) (transform4x4_luma[7])));
549     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
550     S8 = _mm_add_epi32(S8, m128iAdd);
551     S8 = _mm_srai_epi32(S8, shift_1st);
552 
553     m128iD = _mm_packs_epi32(S0, S8);
554 
555     S0 = _mm_unpacklo_epi16(m128iA, m128iD);
556     S8 = _mm_unpackhi_epi16(m128iA, m128iD);
557 
558     m128iA = _mm_unpacklo_epi16(S0, S8);
559     m128iD = _mm_unpackhi_epi16(S0, S8);
560 
561     /*   ###################    */
562     m128iAdd = _mm_set1_epi32(add_2nd);
563 
564     m128iAC = _mm_unpacklo_epi16(m128iA, m128iD);
565     m128iBD = _mm_unpackhi_epi16(m128iA, m128iD);
566 
567     m128iTmp1 = _mm_madd_epi16(m128iAC,
568             _mm_load_si128((__m128i *) (transform4x4_luma[0])));
569     m128iTmp2 = _mm_madd_epi16(m128iBD,
570             _mm_load_si128((__m128i *) (transform4x4_luma[1])));
571     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
572     S0 = _mm_add_epi32(S0, m128iAdd);
573     S0 = _mm_srai_epi32(S0, shift_2nd);
574 
575     m128iTmp1 = _mm_madd_epi16(m128iAC,
576             _mm_load_si128((__m128i *) (transform4x4_luma[2])));
577     m128iTmp2 = _mm_madd_epi16(m128iBD,
578             _mm_load_si128((__m128i *) (transform4x4_luma[3])));
579     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
580     S8 = _mm_add_epi32(S8, m128iAdd);
581     S8 = _mm_srai_epi32(S8, shift_2nd);
582 
583     m128iA = _mm_packs_epi32(S0, S8);
584 
585     m128iTmp1 = _mm_madd_epi16(m128iAC,
586             _mm_load_si128((__m128i *) (transform4x4_luma[4])));
587     m128iTmp2 = _mm_madd_epi16(m128iBD,
588             _mm_load_si128((__m128i *) (transform4x4_luma[5])));
589     S0 = _mm_add_epi32(m128iTmp1, m128iTmp2);
590     S0 = _mm_add_epi32(S0, m128iAdd);
591     S0 = _mm_srai_epi32(S0, shift_2nd);
592 
593     m128iTmp1 = _mm_madd_epi16(m128iAC,
594             _mm_load_si128((__m128i *) (transform4x4_luma[6])));
595     m128iTmp2 = _mm_madd_epi16(m128iBD,
596             _mm_load_si128((__m128i *) (transform4x4_luma[7])));
597     S8 = _mm_add_epi32(m128iTmp1, m128iTmp2);
598     S8 = _mm_add_epi32(S8, m128iAdd);
599     S8 = _mm_srai_epi32(S8, shift_2nd);
600 
601     m128iD = _mm_packs_epi32(S0, S8);
602 
603     _mm_storeu_si128((__m128i *) (src), m128iA);
604     _mm_storeu_si128((__m128i *) (src + 8), m128iD);
605     j = 0;
606     for (i = 0; i < 2; i++) {
607         dst[0] = av_clip_uintp2(dst[0] + src[j],10);
608         dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
609         dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
610         dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
611         j += 1;
612         dst += stride;
613         dst[0] = av_clip_uintp2(dst[0] + src[j],10);
614         dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
615         dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
616         dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
617         j += 1;
618         dst += stride;
619     }
620 
621 }
622 #endif
623 
624 
625 #if HAVE_SSE4_1
ff_hevc_transform_4x4_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)626 void ff_hevc_transform_4x4_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
627         ptrdiff_t _stride) {
628     uint8_t shift_2nd = 12; // 20 - Bit depth
629     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
630 
631     uint8_t *dst = (uint8_t*) _dst;
632     ptrdiff_t stride = _stride;
633     const int16_t *src = coeffs;
634 
635     __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD, m128iTmp1,m128iTmp2;
636     S0 = _mm_load_si128((__m128i *) (src));
637     S8 = _mm_load_si128((__m128i *) (src + 8));
638     m128iAdd = _mm_set1_epi32(add_1st);
639 
640     m128Tmp = _mm_unpacklo_epi16(S0, S8);
641     E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
642     E1 = _mm_add_epi32(E1, m128iAdd);
643 
644     E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
645     E2 = _mm_add_epi32(E2, m128iAdd);
646 
647     m128Tmp = _mm_unpackhi_epi16(S0, S8);
648     O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
649     O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
650 
651     m128iA = _mm_add_epi32(E1, O1);
652     m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
653     m128Tmp = _mm_add_epi32(E2, O2);
654     m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
655     m128iA = _mm_packs_epi32(m128iA, m128Tmp);
656 
657     m128iD = _mm_sub_epi32(E2, O2);
658     m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum
659 
660     m128Tmp = _mm_sub_epi32(E1, O1);
661     m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
662 
663     m128iD = _mm_packs_epi32(m128iD, m128Tmp);
664 
665     S0 = _mm_unpacklo_epi16(m128iA, m128iD);
666     S8 = _mm_unpackhi_epi16(m128iA, m128iD);
667 
668     m128iA = _mm_unpacklo_epi16(S0, S8);
669     m128iD = _mm_unpackhi_epi16(S0, S8);
670 
671     /*  ##########################  */
672 
673     m128iAdd = _mm_set1_epi32(add_2nd);
674     m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
675     E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
676     E1 = _mm_add_epi32(E1, m128iAdd);
677 
678     E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
679     E2 = _mm_add_epi32(E2, m128iAdd);
680 
681     m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
682     O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
683     O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));
684 
685     m128iA = _mm_add_epi32(E1, O1);
686     m128iA = _mm_srai_epi32(m128iA, shift_2nd);
687     m128Tmp = _mm_add_epi32(E2, O2);
688     m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
689     m128iA = _mm_packs_epi32(m128iA, m128Tmp);
690 
691     m128iD = _mm_sub_epi32(E2, O2);
692     m128iD = _mm_srai_epi32(m128iD, shift_2nd);
693 
694     m128Tmp = _mm_sub_epi32(E1, O1);
695     m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
696 
697     m128iD = _mm_packs_epi32(m128iD, m128Tmp);
698 
699     S0 = _mm_move_epi64(m128iA); //contains row 0
700     S8 = _mm_move_epi64(m128iD); //row 2
701     m128iA = _mm_srli_si128(m128iA, 8); // row 1
702     m128iD = _mm_srli_si128(m128iD, 8); // row 3
703     m128iTmp1 = _mm_unpacklo_epi16(S0, m128iA);
704     m128iTmp2 = _mm_unpacklo_epi16(S8, m128iD);
705     S0 = _mm_unpacklo_epi32(m128iTmp1, m128iTmp2);
706     S8 = _mm_unpackhi_epi32(m128iTmp1, m128iTmp2);
707 
708     //m128iTmp2 = _mm_set_epi32(0, 0, 0, -1);   //mask to store 4 * 8bit data
709 
710     m128iA = _mm_loadl_epi64((__m128i *) dst);
711     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
712     m128iTmp1 = _mm_adds_epi16(S0, m128iA);	//contains first 4 values
713     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
714     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
715     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
716 
717     dst += stride;
718 
719     m128iA = _mm_loadl_epi64((__m128i *) dst);
720     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
721     m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S0, 8), m128iA);
722     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
723     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
724     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
725 
726     dst += stride;
727 
728     m128iA = _mm_loadl_epi64((__m128i *) dst);
729     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
730     m128iTmp1 = _mm_adds_epi16(S8, m128iA);
731     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
732     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
733     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
734 
735     dst += stride;
736 
737     m128iA = _mm_loadl_epi64((__m128i *) dst);
738     m128iA = _mm_unpacklo_epi8(m128iA, _mm_setzero_si128());
739     m128iTmp1 = _mm_adds_epi16(_mm_srli_si128(S8, 8), m128iA);
740     m128iTmp1 = _mm_packus_epi16(m128iTmp1, _mm_setzero_si128());
741     //_mm_maskmoveu_si128(m128iTmp1, m128iTmp2, (char*) dst);
742     *((uint32_t*)(dst)) = _mm_cvtsi128_si32(m128iTmp1);
743 }
744 #endif
745 
746 #if 0
/* Inverse 4x4 DCT plus residual add for 10-bit output (currently compiled
 * out via the surrounding #if 0).
 *
 * Two 1-D transform passes are performed: the first pass works on the
 * coefficient columns, the second on the (implicitly transposed) rows.
 * The second-pass result is written back into the coefficient buffer as
 * scratch, then added to the destination samples with a clip to 10 bits.
 *
 * NOTE(review): relies on add_1st / shift_1st, presumably file-scope
 * first-pass rounding constants defined earlier in this file — confirm
 * before re-enabling.
 * NOTE(review): 'src = coeffs' discards the const qualifier, and the
 * coefficient buffer IS overwritten below (_mm_storeu_si128 to src); the
 * prototype's 'const int16_t *coeffs' contradicts this — must be resolved
 * before this code can be re-enabled.
 */
void ff_hevc_transform_4x4_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
        ptrdiff_t _stride) {
    int i;
    uint8_t shift_2nd = 10; // 20 - Bit depth
    uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))

    uint16_t *dst = (uint16_t*) _dst;   // 10-bit samples occupy 16-bit slots
    ptrdiff_t stride = _stride/2;       // byte stride -> uint16_t element stride
    int16_t *src = coeffs;              // NOTE(review): const discarded; buffer used as scratch below

    int j;
        __m128i S0, S8, m128iAdd, m128Tmp, E1, E2, O1, O2, m128iA, m128iD;
        // Load all 16 coefficients (two vectors of eight int16 each).
        S0 = _mm_load_si128((__m128i *) (src));
        S8 = _mm_load_si128((__m128i *) (src + 8));
        m128iAdd = _mm_set1_epi32(add_1st);  // first-pass rounding offset

        /* First pass: even part E1/E2 from the low interleave ... */
        m128Tmp = _mm_unpacklo_epi16(S0, S8);
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
        E1 = _mm_add_epi32(E1, m128iAdd);

        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
        E2 = _mm_add_epi32(E2, m128iAdd);

        /* ... and odd part O1/O2 from the high interleave. */
        m128Tmp = _mm_unpackhi_epi16(S0, S8);
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

        /* Butterfly: rows 0/1 = E +/- O, shifted and packed to int16. */
        m128iA = _mm_add_epi32(E1, O1);
        m128iA = _mm_srai_epi32(m128iA, shift_1st);        // Sum = Sum >> iShiftNum
        m128Tmp = _mm_add_epi32(E2, O2);
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);

        m128iD = _mm_sub_epi32(E2, O2);
        m128iD = _mm_srai_epi32(m128iD, shift_1st);        // Sum = Sum >> iShiftNum

        m128Tmp = _mm_sub_epi32(E1, O1);
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_1st);      // Sum = Sum >> iShiftNum

        m128iD = _mm_packs_epi32(m128iD, m128Tmp);

        /* Transpose the 4x4 int16 block via interleaves before pass two. */
        S0 = _mm_unpacklo_epi16(m128iA, m128iD);
        S8 = _mm_unpackhi_epi16(m128iA, m128iD);

        m128iA = _mm_unpacklo_epi16(S0, S8);
        m128iD = _mm_unpackhi_epi16(S0, S8);

        /*  ##########################  */

        /* Second pass: same even/odd structure, second-pass rounding. */
        m128iAdd = _mm_set1_epi32(add_2nd);
        m128Tmp = _mm_unpacklo_epi16(m128iA, m128iD);
        E1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[0])));
        E1 = _mm_add_epi32(E1, m128iAdd);

        E2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[1])));
        E2 = _mm_add_epi32(E2, m128iAdd);

        m128Tmp = _mm_unpackhi_epi16(m128iA, m128iD);
        O1 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[2])));
        O2 = _mm_madd_epi16(m128Tmp, _mm_load_si128((__m128i *) (transform4x4[3])));

        m128iA = _mm_add_epi32(E1, O1);
        m128iA = _mm_srai_epi32(m128iA, shift_2nd);
        m128Tmp = _mm_add_epi32(E2, O2);
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);
        m128iA = _mm_packs_epi32(m128iA, m128Tmp);

        m128iD = _mm_sub_epi32(E2, O2);
        m128iD = _mm_srai_epi32(m128iD, shift_2nd);

        m128Tmp = _mm_sub_epi32(E1, O1);
        m128Tmp = _mm_srai_epi32(m128Tmp, shift_2nd);

        m128iD = _mm_packs_epi32(m128iD, m128Tmp);
        /* Spill the residual back into the coefficient buffer (scratch). */
        _mm_storeu_si128((__m128i *) (src), m128iA);
        _mm_storeu_si128((__m128i *) (src + 8), m128iD);
        /* Add residual to destination with a clip to [0, 1023] (10 bit).
         * src is read column-wise (j, j+4, j+8, j+12), i.e. transposed. */
        j = 0;
        for (i = 0; i < 2; i++) {
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
            j += 1;
            dst += stride;
            dst[0] = av_clip_uintp2(dst[0] + src[j],10);
            dst[1] = av_clip_uintp2(dst[1] + src[j + 4],10);
            dst[2] = av_clip_uintp2(dst[2] + src[j + 8],10);
            dst[3] = av_clip_uintp2(dst[3] + src[j + 12],10);
            j += 1;
            dst += stride;
        }
}
839 #endif
840 
841 #if HAVE_SSE4_1
ff_hevc_transform_8x8_add_8_sse4(uint8_t * _dst,const int16_t * coeffs,ptrdiff_t _stride)842 void ff_hevc_transform_8x8_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
843         ptrdiff_t _stride) {
844     uint8_t shift_2nd = 12; // 20 - Bit depth
845     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
846 
847     uint8_t *dst = (uint8_t*) _dst;
848     ptrdiff_t stride = _stride / sizeof(uint8_t);
849     const int16_t *src = coeffs;
850     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
851             m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
852             E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
853 
854             O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h,
855             T0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11;
856     T0= _mm_load_si128((__m128i *) (transform8x8[0]));
857     T1= _mm_load_si128((__m128i *) (transform8x8[1]));
858     T2= _mm_load_si128((__m128i *) (transform8x8[2]));
859     T3= _mm_load_si128((__m128i *) (transform8x8[3]));
860     T4= _mm_load_si128((__m128i *) (transform8x8[4]));
861     T5= _mm_load_si128((__m128i *) (transform8x8[5]));
862     T6= _mm_load_si128((__m128i *) (transform8x8[6]));
863     T7= _mm_load_si128((__m128i *) (transform8x8[7]));
864     T8= _mm_load_si128((__m128i *) (transform8x8[8]));
865     T9= _mm_load_si128((__m128i *) (transform8x8[9]));
866     T10= _mm_load_si128((__m128i *) (transform8x8[10]));
867     T11= _mm_load_si128((__m128i *) (transform8x8[11]));
868 
869     m128iAdd = _mm_set1_epi32(add_1st);
870 
871     m128iS1 = _mm_load_si128((__m128i *) (src + 8));
872     m128iS3 = _mm_load_si128((__m128i *) (src + 24));
873     m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
874     E1l = _mm_madd_epi16(m128Tmp0, T0);
875     m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
876     E1h = _mm_madd_epi16(m128Tmp1, T0);
877     m128iS5 = _mm_load_si128((__m128i *) (src + 40));
878     m128iS7 = _mm_load_si128((__m128i *) (src + 56));
879     m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
880     E2l = _mm_madd_epi16(m128Tmp2, T1);
881     m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
882     E2h = _mm_madd_epi16(m128Tmp3, T1);
883     O0l = _mm_add_epi32(E1l, E2l);
884     O0h = _mm_add_epi32(E1h, E2h);
885 
886     E1l = _mm_madd_epi16(m128Tmp0, T2);
887     E1h = _mm_madd_epi16(m128Tmp1, T2);
888     E2l = _mm_madd_epi16(m128Tmp2, T3);
889     E2h = _mm_madd_epi16(m128Tmp3, T3);
890 
891     O1l = _mm_add_epi32(E1l, E2l);
892     O1h = _mm_add_epi32(E1h, E2h);
893 
894     E1l = _mm_madd_epi16(m128Tmp0, T4);
895     E1h = _mm_madd_epi16(m128Tmp1, T4);
896     E2l = _mm_madd_epi16(m128Tmp2, T5);
897     E2h = _mm_madd_epi16(m128Tmp3, T5);
898     O2l = _mm_add_epi32(E1l, E2l);
899     O2h = _mm_add_epi32(E1h, E2h);
900 
901     E1l = _mm_madd_epi16(m128Tmp0, T6);
902     E1h = _mm_madd_epi16(m128Tmp1, T6);
903     E2l = _mm_madd_epi16(m128Tmp2, T7);
904     E2h = _mm_madd_epi16(m128Tmp3, T7);
905     O3h = _mm_add_epi32(E1h, E2h);
906     O3l = _mm_add_epi32(E1l, E2l);
907 
908     /*    -------     */
909 
910     m128iS0 = _mm_load_si128((__m128i *) (src + 0));
911     m128iS4 = _mm_load_si128((__m128i *) (src + 32));
912     m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
913     EE0l = _mm_madd_epi16(m128Tmp0, T8);
914     m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
915     EE0h = _mm_madd_epi16(m128Tmp1, T8);
916 
917     EE1l = _mm_madd_epi16(m128Tmp0, T9);
918     EE1h = _mm_madd_epi16(m128Tmp1, T9);
919 
920     /*    -------     */
921 
922     m128iS2 = _mm_load_si128((__m128i *) (src + 16));
923     m128iS6 = _mm_load_si128((__m128i *) (src + 48));
924     m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
925     E00l = _mm_madd_epi16(m128Tmp0, T10);
926     m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
927     E00h = _mm_madd_epi16(m128Tmp1, T10);
928     E01l = _mm_madd_epi16(m128Tmp0, T11);
929     E01h = _mm_madd_epi16(m128Tmp1, T11);
930     E0l = _mm_add_epi32(EE0l, E00l);
931     E0l = _mm_add_epi32(E0l, m128iAdd);
932     E0h = _mm_add_epi32(EE0h, E00h);
933     E0h = _mm_add_epi32(E0h, m128iAdd);
934     E3l = _mm_sub_epi32(EE0l, E00l);
935     E3l = _mm_add_epi32(E3l, m128iAdd);
936     E3h = _mm_sub_epi32(EE0h, E00h);
937     E3h = _mm_add_epi32(E3h, m128iAdd);
938 
939     E1l = _mm_add_epi32(EE1l, E01l);
940     E1l = _mm_add_epi32(E1l, m128iAdd);
941     E1h = _mm_add_epi32(EE1h, E01h);
942     E1h = _mm_add_epi32(E1h, m128iAdd);
943     E2l = _mm_sub_epi32(EE1l, E01l);
944     E2l = _mm_add_epi32(E2l, m128iAdd);
945     E2h = _mm_sub_epi32(EE1h, E01h);
946     E2h = _mm_add_epi32(E2h, m128iAdd);
947     m128iS0 = _mm_packs_epi32(
948             _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
949             _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
950     m128iS1 = _mm_packs_epi32(
951             _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
952             _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
953     m128iS2 = _mm_packs_epi32(
954             _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
955             _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
956     m128iS3 = _mm_packs_epi32(
957             _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
958             _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
959     m128iS4 = _mm_packs_epi32(
960             _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
961             _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
962     m128iS5 = _mm_packs_epi32(
963             _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
964             _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
965     m128iS6 = _mm_packs_epi32(
966             _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
967             _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
968     m128iS7 = _mm_packs_epi32(
969             _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
970             _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
971     /*  Invers matrix   */
972 
973     E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
974     E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
975     E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
976     E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
977     O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
978     O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
979     O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
980     O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
981     m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
982     m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
983     m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
984     m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
985     m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
986     m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
987     m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
988     m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
989     m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
990     m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
991     m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
992     m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
993     m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
994     m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
995     m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
996     m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
997 
998     m128iAdd = _mm_set1_epi32(add_2nd);
999 
1000     m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1001     E1l = _mm_madd_epi16(m128Tmp0, T0);
1002     m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1003     E1h = _mm_madd_epi16(m128Tmp1, T0);
1004     m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1005     E2l = _mm_madd_epi16(m128Tmp2, T1);
1006     m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1007     E2h = _mm_madd_epi16(m128Tmp3, T1);
1008     O0l = _mm_add_epi32(E1l, E2l);
1009     O0h = _mm_add_epi32(E1h, E2h);
1010     E1l = _mm_madd_epi16(m128Tmp0, T2);
1011     E1h = _mm_madd_epi16(m128Tmp1, T2);
1012     E2l = _mm_madd_epi16(m128Tmp2, T3);
1013     E2h = _mm_madd_epi16(m128Tmp3, T3);
1014     O1l = _mm_add_epi32(E1l, E2l);
1015     O1h = _mm_add_epi32(E1h, E2h);
1016     E1l = _mm_madd_epi16(m128Tmp0, T4);
1017     E1h = _mm_madd_epi16(m128Tmp1, T4);
1018     E2l = _mm_madd_epi16(m128Tmp2, T5);
1019     E2h = _mm_madd_epi16(m128Tmp3, T5);
1020     O2l = _mm_add_epi32(E1l, E2l);
1021     O2h = _mm_add_epi32(E1h, E2h);
1022     E1l = _mm_madd_epi16(m128Tmp0, T6);
1023     E1h = _mm_madd_epi16(m128Tmp1, T6);
1024     E2l = _mm_madd_epi16(m128Tmp2, T7);
1025     E2h = _mm_madd_epi16(m128Tmp3, T7);
1026     O3h = _mm_add_epi32(E1h, E2h);
1027     O3l = _mm_add_epi32(E1l, E2l);
1028 
1029     m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1030     EE0l = _mm_madd_epi16(m128Tmp0, T8);
1031     m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1032     EE0h = _mm_madd_epi16(m128Tmp1, T8);
1033     EE1l = _mm_madd_epi16(m128Tmp0, T9);
1034     EE1h = _mm_madd_epi16(m128Tmp1, T9);
1035 
1036     m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1037     E00l = _mm_madd_epi16(m128Tmp0, T10);
1038     m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1039     E00h = _mm_madd_epi16(m128Tmp1, T10);
1040     E01l = _mm_madd_epi16(m128Tmp0, T11);
1041     E01h = _mm_madd_epi16(m128Tmp1, T11);
1042     E0l = _mm_add_epi32(EE0l, E00l);
1043     E0l = _mm_add_epi32(E0l, m128iAdd);
1044     E0h = _mm_add_epi32(EE0h, E00h);
1045     E0h = _mm_add_epi32(E0h, m128iAdd);
1046     E3l = _mm_sub_epi32(EE0l, E00l);
1047     E3l = _mm_add_epi32(E3l, m128iAdd);
1048     E3h = _mm_sub_epi32(EE0h, E00h);
1049     E3h = _mm_add_epi32(E3h, m128iAdd);
1050     E1l = _mm_add_epi32(EE1l, E01l);
1051     E1l = _mm_add_epi32(E1l, m128iAdd);
1052     E1h = _mm_add_epi32(EE1h, E01h);
1053     E1h = _mm_add_epi32(E1h, m128iAdd);
1054     E2l = _mm_sub_epi32(EE1l, E01l);
1055     E2l = _mm_add_epi32(E2l, m128iAdd);
1056     E2h = _mm_sub_epi32(EE1h, E01h);
1057     E2h = _mm_add_epi32(E2h, m128iAdd);
1058 
1059     m128iS0 = _mm_packs_epi32(
1060             _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1061             _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1062     m128iS1 = _mm_packs_epi32(
1063             _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1064             _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1065     m128iS2 = _mm_packs_epi32(
1066             _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1067             _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1068     m128iS3 = _mm_packs_epi32(
1069             _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1070             _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1071     m128iS4 = _mm_packs_epi32(
1072             _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1073             _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1074     m128iS5 = _mm_packs_epi32(
1075             _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1076             _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1077     m128iS6 = _mm_packs_epi32(
1078             _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1079             _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1080     m128iS7 = _mm_packs_epi32(
1081             _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1082             _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1083 
1084     E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1085     E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1086     E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1087     E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1088     O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1089     O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1090     O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1091     O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1092     m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1093     m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1094     m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1095     m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1096     m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1097     m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1098     m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1099     m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1100     m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1101     m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1102     m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1103     m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1104     m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1105     m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1106     m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1107     m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1108 
1109     E0l = _mm_loadl_epi64((__m128i *) dst);
1110     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1111 
1112     E0l = _mm_adds_epi16(E0l, m128iS0);
1113     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1114     _mm_storel_epi64((__m128i *) dst, E0l);
1115     dst += stride;
1116 
1117     E0l = _mm_loadl_epi64((__m128i *) dst);
1118     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1119 
1120     E0l = _mm_adds_epi16(E0l, m128iS1);
1121     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1122     _mm_storel_epi64((__m128i *) dst, E0l);
1123     dst += stride;
1124 
1125     E0l = _mm_loadl_epi64((__m128i *) dst);
1126     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1127 
1128     E0l = _mm_adds_epi16(E0l, m128iS2);
1129     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1130     _mm_storel_epi64((__m128i *) dst, E0l);
1131     dst += stride;
1132 
1133     E0l = _mm_loadl_epi64((__m128i *) dst);
1134     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1135 
1136     E0l = _mm_adds_epi16(E0l, m128iS3);
1137     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1138     _mm_storel_epi64((__m128i *) dst, E0l);
1139     dst += stride;
1140 
1141     E0l = _mm_loadl_epi64((__m128i *) dst);
1142     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1143 
1144     E0l = _mm_adds_epi16(E0l, m128iS4);
1145     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1146     _mm_storel_epi64((__m128i *) dst, E0l);
1147     dst += stride;
1148 
1149     E0l = _mm_loadl_epi64((__m128i *) dst);
1150     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1151 
1152     E0l = _mm_adds_epi16(E0l, m128iS5);
1153     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1154     _mm_storel_epi64((__m128i *) dst, E0l);
1155     dst += stride;
1156 
1157     E0l = _mm_loadl_epi64((__m128i *) dst);
1158     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1159 
1160     E0l = _mm_adds_epi16(E0l, m128iS6);
1161     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1162     _mm_storel_epi64((__m128i *) dst, E0l);
1163     dst += stride;
1164 
1165     E0l = _mm_loadl_epi64((__m128i *) dst);
1166     E0l = _mm_unpacklo_epi8(E0l, _mm_setzero_si128());
1167 
1168     E0l = _mm_adds_epi16(E0l, m128iS7);
1169     E0l = _mm_packus_epi16(E0l, _mm_setzero_si128());
1170     _mm_storel_epi64((__m128i *) dst, E0l);
1171     dst += stride;
1172 
1173 }
1174 #endif
1175 
1176 #if 0
1177 void ff_hevc_transform_8x8_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
1178         ptrdiff_t _stride) {
1179     int i;
1180     uint16_t *dst = (uint16_t*) _dst;
1181     ptrdiff_t stride = _stride / sizeof(uint16_t);
1182     int16_t *src = coeffs;
1183     uint8_t shift_2nd = 10; // 20 - Bit depth
1184     uint16_t add_2nd = 1 << 9; //(1 << (shift_2nd - 1))
1185 
1186     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1187             m128iS7, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2, m128Tmp3, E0h, E1h,
1188             E2h, E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O0l, O1l, O2l,
1189             O3l, EE0l, EE1l, E00l, E01l, EE0h, EE1h, E00h, E01h;
1190     int j;
1191     m128iAdd = _mm_set1_epi32(add_1st);
1192 
1193     m128iS1 = _mm_load_si128((__m128i *) (src + 8));
1194     m128iS3 = _mm_load_si128((__m128i *) (src + 24));
1195     m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1196     E1l = _mm_madd_epi16(m128Tmp0,
1197             _mm_load_si128((__m128i *) (transform8x8[0])));
1198     m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1199     E1h = _mm_madd_epi16(m128Tmp1,
1200             _mm_load_si128((__m128i *) (transform8x8[0])));
1201     m128iS5 = _mm_load_si128((__m128i *) (src + 40));
1202     m128iS7 = _mm_load_si128((__m128i *) (src + 56));
1203     m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1204     E2l = _mm_madd_epi16(m128Tmp2,
1205             _mm_load_si128((__m128i *) (transform8x8[1])));
1206     m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1207     E2h = _mm_madd_epi16(m128Tmp3,
1208             _mm_load_si128((__m128i *) (transform8x8[1])));
1209     O0l = _mm_add_epi32(E1l, E2l);
1210     O0h = _mm_add_epi32(E1h, E2h);
1211 
1212     E1l = _mm_madd_epi16(m128Tmp0,
1213             _mm_load_si128((__m128i *) (transform8x8[2])));
1214     E1h = _mm_madd_epi16(m128Tmp1,
1215             _mm_load_si128((__m128i *) (transform8x8[2])));
1216     E2l = _mm_madd_epi16(m128Tmp2,
1217             _mm_load_si128((__m128i *) (transform8x8[3])));
1218     E2h = _mm_madd_epi16(m128Tmp3,
1219             _mm_load_si128((__m128i *) (transform8x8[3])));
1220 
1221     O1l = _mm_add_epi32(E1l, E2l);
1222     O1h = _mm_add_epi32(E1h, E2h);
1223 
1224     E1l = _mm_madd_epi16(m128Tmp0,
1225             _mm_load_si128((__m128i *) (transform8x8[4])));
1226     E1h = _mm_madd_epi16(m128Tmp1,
1227             _mm_load_si128((__m128i *) (transform8x8[4])));
1228     E2l = _mm_madd_epi16(m128Tmp2,
1229             _mm_load_si128((__m128i *) (transform8x8[5])));
1230     E2h = _mm_madd_epi16(m128Tmp3,
1231             _mm_load_si128((__m128i *) (transform8x8[5])));
1232     O2l = _mm_add_epi32(E1l, E2l);
1233     O2h = _mm_add_epi32(E1h, E2h);
1234 
1235     E1l = _mm_madd_epi16(m128Tmp0,
1236             _mm_load_si128((__m128i *) (transform8x8[6])));
1237     E1h = _mm_madd_epi16(m128Tmp1,
1238             _mm_load_si128((__m128i *) (transform8x8[6])));
1239     E2l = _mm_madd_epi16(m128Tmp2,
1240             _mm_load_si128((__m128i *) (transform8x8[7])));
1241     E2h = _mm_madd_epi16(m128Tmp3,
1242             _mm_load_si128((__m128i *) (transform8x8[7])));
1243     O3h = _mm_add_epi32(E1h, E2h);
1244     O3l = _mm_add_epi32(E1l, E2l);
1245 
1246     /*    -------     */
1247 
1248     m128iS0 = _mm_load_si128((__m128i *) (src + 0));
1249     m128iS4 = _mm_load_si128((__m128i *) (src + 32));
1250     m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1251     EE0l = _mm_madd_epi16(m128Tmp0,
1252             _mm_load_si128((__m128i *) (transform8x8[8])));
1253     m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1254     EE0h = _mm_madd_epi16(m128Tmp1,
1255             _mm_load_si128((__m128i *) (transform8x8[8])));
1256 
1257     EE1l = _mm_madd_epi16(m128Tmp0,
1258             _mm_load_si128((__m128i *) (transform8x8[9])));
1259     EE1h = _mm_madd_epi16(m128Tmp1,
1260             _mm_load_si128((__m128i *) (transform8x8[9])));
1261 
1262     /*    -------     */
1263 
1264     m128iS2 = _mm_load_si128((__m128i *) (src + 16));
1265     m128iS6 = _mm_load_si128((__m128i *) (src + 48));
1266     m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1267     E00l = _mm_madd_epi16(m128Tmp0,
1268             _mm_load_si128((__m128i *) (transform8x8[10])));
1269     m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1270     E00h = _mm_madd_epi16(m128Tmp1,
1271             _mm_load_si128((__m128i *) (transform8x8[10])));
1272     E01l = _mm_madd_epi16(m128Tmp0,
1273             _mm_load_si128((__m128i *) (transform8x8[11])));
1274     E01h = _mm_madd_epi16(m128Tmp1,
1275             _mm_load_si128((__m128i *) (transform8x8[11])));
1276     E0l = _mm_add_epi32(EE0l, E00l);
1277     E0l = _mm_add_epi32(E0l, m128iAdd);
1278     E0h = _mm_add_epi32(EE0h, E00h);
1279     E0h = _mm_add_epi32(E0h, m128iAdd);
1280     E3l = _mm_sub_epi32(EE0l, E00l);
1281     E3l = _mm_add_epi32(E3l, m128iAdd);
1282     E3h = _mm_sub_epi32(EE0h, E00h);
1283     E3h = _mm_add_epi32(E3h, m128iAdd);
1284 
1285     E1l = _mm_add_epi32(EE1l, E01l);
1286     E1l = _mm_add_epi32(E1l, m128iAdd);
1287     E1h = _mm_add_epi32(EE1h, E01h);
1288     E1h = _mm_add_epi32(E1h, m128iAdd);
1289     E2l = _mm_sub_epi32(EE1l, E01l);
1290     E2l = _mm_add_epi32(E2l, m128iAdd);
1291     E2h = _mm_sub_epi32(EE1h, E01h);
1292     E2h = _mm_add_epi32(E2h, m128iAdd);
1293     m128iS0 = _mm_packs_epi32(
1294             _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_1st),
1295             _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_1st));
1296     m128iS1 = _mm_packs_epi32(
1297             _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_1st),
1298             _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_1st));
1299     m128iS2 = _mm_packs_epi32(
1300             _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_1st),
1301             _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_1st));
1302     m128iS3 = _mm_packs_epi32(
1303             _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_1st),
1304             _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_1st));
1305     m128iS4 = _mm_packs_epi32(
1306             _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_1st),
1307             _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_1st));
1308     m128iS5 = _mm_packs_epi32(
1309             _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_1st),
1310             _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_1st));
1311     m128iS6 = _mm_packs_epi32(
1312             _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_1st),
1313             _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_1st));
1314     m128iS7 = _mm_packs_epi32(
1315             _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_1st),
1316             _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_1st));
1317     /*  Invers matrix   */
1318 
1319     E0l = _mm_unpacklo_epi16(m128iS0, m128iS4);
1320     E1l = _mm_unpacklo_epi16(m128iS1, m128iS5);
1321     E2l = _mm_unpacklo_epi16(m128iS2, m128iS6);
1322     E3l = _mm_unpacklo_epi16(m128iS3, m128iS7);
1323     O0l = _mm_unpackhi_epi16(m128iS0, m128iS4);
1324     O1l = _mm_unpackhi_epi16(m128iS1, m128iS5);
1325     O2l = _mm_unpackhi_epi16(m128iS2, m128iS6);
1326     O3l = _mm_unpackhi_epi16(m128iS3, m128iS7);
1327     m128Tmp0 = _mm_unpacklo_epi16(E0l, E2l);
1328     m128Tmp1 = _mm_unpacklo_epi16(E1l, E3l);
1329     m128iS0 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1330     m128iS1 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1331     m128Tmp2 = _mm_unpackhi_epi16(E0l, E2l);
1332     m128Tmp3 = _mm_unpackhi_epi16(E1l, E3l);
1333     m128iS2 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1334     m128iS3 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1335     m128Tmp0 = _mm_unpacklo_epi16(O0l, O2l);
1336     m128Tmp1 = _mm_unpacklo_epi16(O1l, O3l);
1337     m128iS4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp1);
1338     m128iS5 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp1);
1339     m128Tmp2 = _mm_unpackhi_epi16(O0l, O2l);
1340     m128Tmp3 = _mm_unpackhi_epi16(O1l, O3l);
1341     m128iS6 = _mm_unpacklo_epi16(m128Tmp2, m128Tmp3);
1342     m128iS7 = _mm_unpackhi_epi16(m128Tmp2, m128Tmp3);
1343 
1344     m128iAdd = _mm_set1_epi32(add_2nd);
1345 
1346     m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1347     E1l = _mm_madd_epi16(m128Tmp0,
1348             _mm_load_si128((__m128i *) (transform8x8[0])));
1349     m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1350     E1h = _mm_madd_epi16(m128Tmp1,
1351             _mm_load_si128((__m128i *) (transform8x8[0])));
1352     m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1353     E2l = _mm_madd_epi16(m128Tmp2,
1354             _mm_load_si128((__m128i *) (transform8x8[1])));
1355     m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1356     E2h = _mm_madd_epi16(m128Tmp3,
1357             _mm_load_si128((__m128i *) (transform8x8[1])));
1358     O0l = _mm_add_epi32(E1l, E2l);
1359     O0h = _mm_add_epi32(E1h, E2h);
1360     E1l = _mm_madd_epi16(m128Tmp0,
1361             _mm_load_si128((__m128i *) (transform8x8[2])));
1362     E1h = _mm_madd_epi16(m128Tmp1,
1363             _mm_load_si128((__m128i *) (transform8x8[2])));
1364     E2l = _mm_madd_epi16(m128Tmp2,
1365             _mm_load_si128((__m128i *) (transform8x8[3])));
1366     E2h = _mm_madd_epi16(m128Tmp3,
1367             _mm_load_si128((__m128i *) (transform8x8[3])));
1368     O1l = _mm_add_epi32(E1l, E2l);
1369     O1h = _mm_add_epi32(E1h, E2h);
1370     E1l = _mm_madd_epi16(m128Tmp0,
1371             _mm_load_si128((__m128i *) (transform8x8[4])));
1372     E1h = _mm_madd_epi16(m128Tmp1,
1373             _mm_load_si128((__m128i *) (transform8x8[4])));
1374     E2l = _mm_madd_epi16(m128Tmp2,
1375             _mm_load_si128((__m128i *) (transform8x8[5])));
1376     E2h = _mm_madd_epi16(m128Tmp3,
1377             _mm_load_si128((__m128i *) (transform8x8[5])));
1378     O2l = _mm_add_epi32(E1l, E2l);
1379     O2h = _mm_add_epi32(E1h, E2h);
1380     E1l = _mm_madd_epi16(m128Tmp0,
1381             _mm_load_si128((__m128i *) (transform8x8[6])));
1382     E1h = _mm_madd_epi16(m128Tmp1,
1383             _mm_load_si128((__m128i *) (transform8x8[6])));
1384     E2l = _mm_madd_epi16(m128Tmp2,
1385             _mm_load_si128((__m128i *) (transform8x8[7])));
1386     E2h = _mm_madd_epi16(m128Tmp3,
1387             _mm_load_si128((__m128i *) (transform8x8[7])));
1388     O3h = _mm_add_epi32(E1h, E2h);
1389     O3l = _mm_add_epi32(E1l, E2l);
1390 
1391     m128Tmp0 = _mm_unpacklo_epi16(m128iS0, m128iS4);
1392     EE0l = _mm_madd_epi16(m128Tmp0,
1393             _mm_load_si128((__m128i *) (transform8x8[8])));
1394     m128Tmp1 = _mm_unpackhi_epi16(m128iS0, m128iS4);
1395     EE0h = _mm_madd_epi16(m128Tmp1,
1396             _mm_load_si128((__m128i *) (transform8x8[8])));
1397     EE1l = _mm_madd_epi16(m128Tmp0,
1398             _mm_load_si128((__m128i *) (transform8x8[9])));
1399     EE1h = _mm_madd_epi16(m128Tmp1,
1400             _mm_load_si128((__m128i *) (transform8x8[9])));
1401 
1402     m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1403     E00l = _mm_madd_epi16(m128Tmp0,
1404             _mm_load_si128((__m128i *) (transform8x8[10])));
1405     m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1406     E00h = _mm_madd_epi16(m128Tmp1,
1407             _mm_load_si128((__m128i *) (transform8x8[10])));
1408     E01l = _mm_madd_epi16(m128Tmp0,
1409             _mm_load_si128((__m128i *) (transform8x8[11])));
1410     E01h = _mm_madd_epi16(m128Tmp1,
1411             _mm_load_si128((__m128i *) (transform8x8[11])));
1412     E0l = _mm_add_epi32(EE0l, E00l);
1413     E0l = _mm_add_epi32(E0l, m128iAdd);
1414     E0h = _mm_add_epi32(EE0h, E00h);
1415     E0h = _mm_add_epi32(E0h, m128iAdd);
1416     E3l = _mm_sub_epi32(EE0l, E00l);
1417     E3l = _mm_add_epi32(E3l, m128iAdd);
1418     E3h = _mm_sub_epi32(EE0h, E00h);
1419     E3h = _mm_add_epi32(E3h, m128iAdd);
1420     E1l = _mm_add_epi32(EE1l, E01l);
1421     E1l = _mm_add_epi32(E1l, m128iAdd);
1422     E1h = _mm_add_epi32(EE1h, E01h);
1423     E1h = _mm_add_epi32(E1h, m128iAdd);
1424     E2l = _mm_sub_epi32(EE1l, E01l);
1425     E2l = _mm_add_epi32(E2l, m128iAdd);
1426     E2h = _mm_sub_epi32(EE1h, E01h);
1427     E2h = _mm_add_epi32(E2h, m128iAdd);
1428 
1429     m128iS0 = _mm_packs_epi32(
1430             _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift_2nd),
1431             _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift_2nd));
1432     m128iS1 = _mm_packs_epi32(
1433             _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift_2nd),
1434             _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift_2nd));
1435     m128iS2 = _mm_packs_epi32(
1436             _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift_2nd),
1437             _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift_2nd));
1438     m128iS3 = _mm_packs_epi32(
1439             _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift_2nd),
1440             _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift_2nd));
1441     m128iS4 = _mm_packs_epi32(
1442             _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift_2nd),
1443             _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift_2nd));
1444     m128iS5 = _mm_packs_epi32(
1445             _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift_2nd),
1446             _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift_2nd));
1447     m128iS6 = _mm_packs_epi32(
1448             _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift_2nd),
1449             _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift_2nd));
1450     m128iS7 = _mm_packs_epi32(
1451             _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift_2nd),
1452             _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift_2nd));
1453 
1454     _mm_store_si128((__m128i *) (src), m128iS0);
1455     _mm_store_si128((__m128i *) (src + 8), m128iS1);
1456     _mm_store_si128((__m128i *) (src + 16), m128iS2);
1457     _mm_store_si128((__m128i *) (src + 24), m128iS3);
1458     _mm_store_si128((__m128i *) (src + 32), m128iS4);
1459     _mm_store_si128((__m128i *) (src + 40), m128iS5);
1460     _mm_store_si128((__m128i *) (src + 48), m128iS6);
1461     _mm_store_si128((__m128i *) (src + 56), m128iS7);
1462 
1463     j = 0;
1464     for (i = 0; i < 4; i++) {
1465         dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1466         dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1467         dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1468         dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1469         dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1470         dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1471         dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1472         dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1473         j += 1;
1474         dst += stride;
1475         dst[0] = av_clip_uintp2(dst[0] + src[j],10);
1476         dst[1] = av_clip_uintp2(dst[1] + src[j + 8],10);
1477         dst[2] = av_clip_uintp2(dst[2] + src[j + 16],10);
1478         dst[3] = av_clip_uintp2(dst[3] + src[j + 24],10);
1479         dst[4] = av_clip_uintp2(dst[4] + src[j + 32],10);
1480         dst[5] = av_clip_uintp2(dst[5] + src[j + 40],10);
1481         dst[6] = av_clip_uintp2(dst[6] + src[j + 48],10);
1482         dst[7] = av_clip_uintp2(dst[7] + src[j + 56],10);
1483         j += 1;
1484         dst += stride;
1485     }
1486 
1487 }
1488 #endif
1489 
1490 
1491 #if HAVE_SSE4_1
1492 void ff_hevc_transform_16x16_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
1493         ptrdiff_t _stride) {
1494     uint8_t shift_2nd = 12; // 20 - Bit depth
1495     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
1496     int i;
1497     uint8_t *dst = (uint8_t*) _dst;
1498     ptrdiff_t stride = _stride / sizeof(uint8_t);
1499     const int16_t *src = coeffs;
1500     int32_t shift;
1501     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
1502             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
1503             m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
1504             m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
1505             E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
1506             O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
1507             E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
1508     __m128i E4l, E5l, E6l, E7l;
1509     __m128i E4h, E5h, E6h, E7h;
1510     __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15;
1511     __m128i r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
1512 
1513 
1514     /*__m128i T00,T01, T02, T03, T04, T05, T06, T07;
1515     __m128i T10,T11, T12, T13, T14, T15, T16, T17;
1516     __m128i T20,T21, T22, T23, T24, T25, T26, T27;
1517     __m128i T30,T31, T32, T33, T34, T35, T36, T37;
1518 
1519     __m128i U00,U01, U02, U03, U10, U11, U12, U13;
1520 
1521     __m128i V00,V01, V10, V11;*/
1522 
1523 
1524     const __m128i T00 = _mm_load_si128((__m128i *) (transform16x16_1[0][0]));
1525     const __m128i T01 = _mm_load_si128((__m128i *) (transform16x16_1[0][1]));
1526     const __m128i T02 = _mm_load_si128((__m128i *) (transform16x16_1[0][2]));
1527     const __m128i T03 = _mm_load_si128((__m128i *) (transform16x16_1[0][3]));
1528     const __m128i T04 = _mm_load_si128((__m128i *) (transform16x16_1[0][4]));
1529     const __m128i T05 = _mm_load_si128((__m128i *) (transform16x16_1[0][5]));
1530     const __m128i T06 = _mm_load_si128((__m128i *) (transform16x16_1[0][6]));
1531     const __m128i T07 = _mm_load_si128((__m128i *) (transform16x16_1[0][7]));
1532     const __m128i T10 = _mm_load_si128((__m128i *) (transform16x16_1[1][0]));
1533     const __m128i T11 = _mm_load_si128((__m128i *) (transform16x16_1[1][1]));
1534     const __m128i T12 = _mm_load_si128((__m128i *) (transform16x16_1[1][2]));
1535     const __m128i T13 = _mm_load_si128((__m128i *) (transform16x16_1[1][3]));
1536     const __m128i T14 = _mm_load_si128((__m128i *) (transform16x16_1[1][4]));
1537     const __m128i T15 = _mm_load_si128((__m128i *) (transform16x16_1[1][5]));
1538     const __m128i T16 = _mm_load_si128((__m128i *) (transform16x16_1[1][6]));
1539     const __m128i T17 = _mm_load_si128((__m128i *) (transform16x16_1[1][7]));
1540     const __m128i T20 = _mm_load_si128((__m128i *) (transform16x16_1[2][0]));
1541     const __m128i T21 = _mm_load_si128((__m128i *) (transform16x16_1[2][1]));
1542     const __m128i T22 = _mm_load_si128((__m128i *) (transform16x16_1[2][2]));
1543     const __m128i T23 = _mm_load_si128((__m128i *) (transform16x16_1[2][3]));
1544     const __m128i T24 = _mm_load_si128((__m128i *) (transform16x16_1[2][4]));
1545     const __m128i T25 = _mm_load_si128((__m128i *) (transform16x16_1[2][5]));
1546     const __m128i T26 = _mm_load_si128((__m128i *) (transform16x16_1[2][6]));
1547     const __m128i T27 = _mm_load_si128((__m128i *) (transform16x16_1[2][7]));
1548     const __m128i T30 = _mm_load_si128((__m128i *) (transform16x16_1[3][0]));
1549     const __m128i T31 = _mm_load_si128((__m128i *) (transform16x16_1[3][1]));
1550     const __m128i T32 = _mm_load_si128((__m128i *) (transform16x16_1[3][2]));
1551     const __m128i T33 = _mm_load_si128((__m128i *) (transform16x16_1[3][3]));
1552     const __m128i T34 = _mm_load_si128((__m128i *) (transform16x16_1[3][4]));
1553     const __m128i T35 = _mm_load_si128((__m128i *) (transform16x16_1[3][5]));
1554     const __m128i T36 = _mm_load_si128((__m128i *) (transform16x16_1[3][6]));
1555     const __m128i T37 = _mm_load_si128((__m128i *) (transform16x16_1[3][7]));
1556 
1557     const __m128i U00 = _mm_load_si128((__m128i *) (transform16x16_2[0][0]));
1558     const __m128i U01 = _mm_load_si128((__m128i *) (transform16x16_2[0][1]));
1559     const __m128i U02 = _mm_load_si128((__m128i *) (transform16x16_2[0][2]));
1560     const __m128i U03 = _mm_load_si128((__m128i *) (transform16x16_2[0][3]));
1561     const __m128i U10 = _mm_load_si128((__m128i *) (transform16x16_2[1][0]));
1562     const __m128i U11 = _mm_load_si128((__m128i *) (transform16x16_2[1][1]));
1563     const __m128i U12 = _mm_load_si128((__m128i *) (transform16x16_2[1][2]));
1564     const __m128i U13 = _mm_load_si128((__m128i *) (transform16x16_2[1][3]));
1565 
1566     const __m128i V00 = _mm_load_si128((__m128i *) (transform16x16_3[0][0]));
1567     const __m128i V01 = _mm_load_si128((__m128i *) (transform16x16_3[0][1]));
1568     const __m128i V10 = _mm_load_si128((__m128i *) (transform16x16_3[1][0]));
1569     const __m128i V11 = _mm_load_si128((__m128i *) (transform16x16_3[1][1]));
1570 
1571 
1572 
1573     int j;
1574     m128iS0 = _mm_load_si128((__m128i *) (src));
1575     m128iS1 = _mm_load_si128((__m128i *) (src + 16));
1576     m128iS2 = _mm_load_si128((__m128i *) (src + 32));
1577     m128iS3 = _mm_load_si128((__m128i *) (src + 48));
1578     m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
1579     m128iS5 = _mm_load_si128((__m128i *) (src + 80));
1580     m128iS6 = _mm_load_si128((__m128i *) (src + 96));
1581     m128iS7 = _mm_load_si128((__m128i *) (src + 112));
1582     m128iS8 = _mm_load_si128((__m128i *) (src + 128));
1583     m128iS9 = _mm_load_si128((__m128i *) (src + 144));
1584     m128iS10 = _mm_load_si128((__m128i *) (src + 160));
1585     m128iS11 = _mm_load_si128((__m128i *) (src + 176));
1586     m128iS12 = _mm_load_si128((__m128i *) (src + 192));
1587     m128iS13 = _mm_load_si128((__m128i *) (src + 208));
1588     m128iS14 = _mm_load_si128((__m128i *) (src + 224));
1589     m128iS15 = _mm_load_si128((__m128i *) (src + 240));
1590     shift = shift_1st;
1591     m128iAdd = _mm_set1_epi32(add_1st);
1592 
1593     for (j = 0; j < 2; j++) {
1594         for (i = 0; i < 16; i += 8) {
1595 
1596             m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
1597             E0l = _mm_madd_epi16(m128Tmp0,T00);
1598             m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
1599             E0h = _mm_madd_epi16(m128Tmp1,T00);
1600 
1601             m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
1602             E1l = _mm_madd_epi16(m128Tmp2,T10);
1603             m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
1604             E1h = _mm_madd_epi16(m128Tmp3,T10);
1605 
1606             m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
1607             E2l = _mm_madd_epi16(m128Tmp4,T20);
1608             m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
1609             E2h = _mm_madd_epi16(m128Tmp5,T20);
1610 
1611             m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
1612             E3l = _mm_madd_epi16(m128Tmp6,T30);
1613             m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
1614             E3h = _mm_madd_epi16(m128Tmp7,T30);
1615 
1616             O0l = _mm_add_epi32(E0l, E1l);
1617             O0l = _mm_add_epi32(O0l, E2l);
1618             O0l = _mm_add_epi32(O0l, E3l);
1619 
1620             O0h = _mm_add_epi32(E0h, E1h);
1621             O0h = _mm_add_epi32(O0h, E2h);
1622             O0h = _mm_add_epi32(O0h, E3h);
1623 
1624             /* Compute O1*/
1625             E0l = _mm_madd_epi16(m128Tmp0,T01);
1626             E0h = _mm_madd_epi16(m128Tmp1,T01);
1627             E1l = _mm_madd_epi16(m128Tmp2,T11);
1628             E1h = _mm_madd_epi16(m128Tmp3,T11);
1629             E2l = _mm_madd_epi16(m128Tmp4,T21);
1630             E2h = _mm_madd_epi16(m128Tmp5,T21);
1631             E3l = _mm_madd_epi16(m128Tmp6,T31);
1632             E3h = _mm_madd_epi16(m128Tmp7,T31);
1633             O1l = _mm_add_epi32(E0l, E1l);
1634             O1l = _mm_add_epi32(O1l, E2l);
1635             O1l = _mm_add_epi32(O1l, E3l);
1636             O1h = _mm_add_epi32(E0h, E1h);
1637             O1h = _mm_add_epi32(O1h, E2h);
1638             O1h = _mm_add_epi32(O1h, E3h);
1639 
1640             /* Compute O2*/
1641             E0l = _mm_madd_epi16(m128Tmp0,T02);
1642             E0h = _mm_madd_epi16(m128Tmp1,T02);
1643             E1l = _mm_madd_epi16(m128Tmp2,T12);
1644             E1h = _mm_madd_epi16(m128Tmp3,T12);
1645             E2l = _mm_madd_epi16(m128Tmp4,T22);
1646             E2h = _mm_madd_epi16(m128Tmp5,T22);
1647             E3l = _mm_madd_epi16(m128Tmp6,T32);
1648             E3h = _mm_madd_epi16(m128Tmp7,T32);
1649             O2l = _mm_add_epi32(E0l, E1l);
1650             O2l = _mm_add_epi32(O2l, E2l);
1651             O2l = _mm_add_epi32(O2l, E3l);
1652 
1653             O2h = _mm_add_epi32(E0h, E1h);
1654             O2h = _mm_add_epi32(O2h, E2h);
1655             O2h = _mm_add_epi32(O2h, E3h);
1656 
1657             /* Compute O3*/
1658             E0l = _mm_madd_epi16(m128Tmp0,T03);
1659             E0h = _mm_madd_epi16(m128Tmp1,T03);
1660             E1l = _mm_madd_epi16(m128Tmp2,T13);
1661             E1h = _mm_madd_epi16(m128Tmp3,T13);
1662             E2l = _mm_madd_epi16(m128Tmp4,T23);
1663             E2h = _mm_madd_epi16(m128Tmp5,T23);
1664             E3l = _mm_madd_epi16(m128Tmp6,T33);
1665             E3h = _mm_madd_epi16(m128Tmp7,T33);
1666 
1667             O3l = _mm_add_epi32(E0l, E1l);
1668             O3l = _mm_add_epi32(O3l, E2l);
1669             O3l = _mm_add_epi32(O3l, E3l);
1670 
1671             O3h = _mm_add_epi32(E0h, E1h);
1672             O3h = _mm_add_epi32(O3h, E2h);
1673             O3h = _mm_add_epi32(O3h, E3h);
1674 
1675             /* Compute O4*/
1676 
1677             E0l = _mm_madd_epi16(m128Tmp0,T04);
1678             E0h = _mm_madd_epi16(m128Tmp1,T04);
1679             E1l = _mm_madd_epi16(m128Tmp2,T14);
1680             E1h = _mm_madd_epi16(m128Tmp3,T14);
1681             E2l = _mm_madd_epi16(m128Tmp4,T24);
1682             E2h = _mm_madd_epi16(m128Tmp5,T24);
1683             E3l = _mm_madd_epi16(m128Tmp6,T34);
1684             E3h = _mm_madd_epi16(m128Tmp7,T34);
1685 
1686             O4l = _mm_add_epi32(E0l, E1l);
1687             O4l = _mm_add_epi32(O4l, E2l);
1688             O4l = _mm_add_epi32(O4l, E3l);
1689 
1690             O4h = _mm_add_epi32(E0h, E1h);
1691             O4h = _mm_add_epi32(O4h, E2h);
1692             O4h = _mm_add_epi32(O4h, E3h);
1693 
1694             /* Compute O5*/
1695             E0l = _mm_madd_epi16(m128Tmp0,T05);
1696             E0h = _mm_madd_epi16(m128Tmp1,T05);
1697             E1l = _mm_madd_epi16(m128Tmp2,T15);
1698             E1h = _mm_madd_epi16(m128Tmp3,T15);
1699             E2l = _mm_madd_epi16(m128Tmp4,T25);
1700             E2h = _mm_madd_epi16(m128Tmp5,T25);
1701             E3l = _mm_madd_epi16(m128Tmp6,T35);
1702             E3h = _mm_madd_epi16(m128Tmp7,T35);
1703 
1704             O5l = _mm_add_epi32(E0l, E1l);
1705             O5l = _mm_add_epi32(O5l, E2l);
1706             O5l = _mm_add_epi32(O5l, E3l);
1707 
1708             O5h = _mm_add_epi32(E0h, E1h);
1709             O5h = _mm_add_epi32(O5h, E2h);
1710             O5h = _mm_add_epi32(O5h, E3h);
1711 
1712             /* Compute O6*/
1713 
1714             E0l = _mm_madd_epi16(m128Tmp0,T06);
1715             E0h = _mm_madd_epi16(m128Tmp1,T06);
1716             E1l = _mm_madd_epi16(m128Tmp2,T16);
1717             E1h = _mm_madd_epi16(m128Tmp3,T16);
1718             E2l = _mm_madd_epi16(m128Tmp4,T26);
1719             E2h = _mm_madd_epi16(m128Tmp5,T26);
1720             E3l = _mm_madd_epi16(m128Tmp6,T36);
1721             E3h = _mm_madd_epi16(m128Tmp7,T36);
1722 
1723             O6l = _mm_add_epi32(E0l, E1l);
1724             O6l = _mm_add_epi32(O6l, E2l);
1725             O6l = _mm_add_epi32(O6l, E3l);
1726 
1727             O6h = _mm_add_epi32(E0h, E1h);
1728             O6h = _mm_add_epi32(O6h, E2h);
1729             O6h = _mm_add_epi32(O6h, E3h);
1730 
1731             /* Compute O7*/
1732 
1733             E0l = _mm_madd_epi16(m128Tmp0,T07);
1734             E0h = _mm_madd_epi16(m128Tmp1,T07);
1735             E1l = _mm_madd_epi16(m128Tmp2,T17);
1736             E1h = _mm_madd_epi16(m128Tmp3,T17);
1737             E2l = _mm_madd_epi16(m128Tmp4,T27);
1738             E2h = _mm_madd_epi16(m128Tmp5,T27);
1739             E3l = _mm_madd_epi16(m128Tmp6,T37);
1740             E3h = _mm_madd_epi16(m128Tmp7,T37);
1741 
1742             O7l = _mm_add_epi32(E0l, E1l);
1743             O7l = _mm_add_epi32(O7l, E2l);
1744             O7l = _mm_add_epi32(O7l, E3l);
1745 
1746             O7h = _mm_add_epi32(E0h, E1h);
1747             O7h = _mm_add_epi32(O7h, E2h);
1748             O7h = _mm_add_epi32(O7h, E3h);
1749 
1750             /*  Compute E0  */
1751 
1752 
1753 
1754             m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
1755             E0l = _mm_madd_epi16(m128Tmp0,U00);
1756             m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
1757             E0h = _mm_madd_epi16(m128Tmp1,U00);
1758 
1759             m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
1760             E0l = _mm_add_epi32(E0l,
1761                     _mm_madd_epi16(m128Tmp2,U10));
1762             m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
1763             E0h = _mm_add_epi32(E0h,
1764                     _mm_madd_epi16(m128Tmp3,U10));
1765 
1766             /*  Compute E1  */
1767             E1l = _mm_madd_epi16(m128Tmp0,U01);
1768             E1h = _mm_madd_epi16(m128Tmp1,U01);
1769             E1l = _mm_add_epi32(E1l,
1770                     _mm_madd_epi16(m128Tmp2,U11));
1771             E1h = _mm_add_epi32(E1h,
1772                     _mm_madd_epi16(m128Tmp3,U11));
1773 
1774             /*  Compute E2  */
1775             E2l = _mm_madd_epi16(m128Tmp0,U02);
1776             E2h = _mm_madd_epi16(m128Tmp1,U02);
1777             E2l = _mm_add_epi32(E2l,
1778                     _mm_madd_epi16(m128Tmp2,U12));
1779             E2h = _mm_add_epi32(E2h,
1780                     _mm_madd_epi16(m128Tmp3,U12));
1781             /*  Compute E3  */
1782             E3l = _mm_madd_epi16(m128Tmp0,U03);
1783             E3h = _mm_madd_epi16(m128Tmp1,U03);
1784             E3l = _mm_add_epi32(E3l,
1785                     _mm_madd_epi16(m128Tmp2,U13));
1786             E3h = _mm_add_epi32(E3h,
1787                     _mm_madd_epi16(m128Tmp3,U13));
1788 
1789             /*  Compute EE0 and EEE */
1790 
1791             m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
1792             E00l = _mm_madd_epi16(m128Tmp0,V00);
1793             m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
1794             E00h = _mm_madd_epi16(m128Tmp1,V00);
1795 
1796             m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
1797             EE0l = _mm_madd_epi16(m128Tmp2,V10);
1798             m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
1799             EE0h = _mm_madd_epi16(m128Tmp3,V10);
1800 
1801             E01l = _mm_madd_epi16(m128Tmp0,V01);
1802             E01h = _mm_madd_epi16(m128Tmp1,V01);
1803 
1804             EE1l = _mm_madd_epi16(m128Tmp2,V11);
1805             EE1h = _mm_madd_epi16(m128Tmp3,V11);
1806 
1807             /*  Compute EE    */
1808             EE2l = _mm_sub_epi32(EE1l, E01l);
1809             EE3l = _mm_sub_epi32(EE0l, E00l);
1810             EE2h = _mm_sub_epi32(EE1h, E01h);
1811             EE3h = _mm_sub_epi32(EE0h, E00h);
1812 
1813             EE0l = _mm_add_epi32(EE0l, E00l);
1814             EE1l = _mm_add_epi32(EE1l, E01l);
1815             EE0h = _mm_add_epi32(EE0h, E00h);
1816             EE1h = _mm_add_epi32(EE1h, E01h);
1817 
1818             /*      Compute E       */
1819 
1820             E4l = _mm_sub_epi32(EE3l, E3l);
1821             E4l = _mm_add_epi32(E4l, m128iAdd);
1822 
1823             E5l = _mm_sub_epi32(EE2l, E2l);
1824             E5l = _mm_add_epi32(E5l, m128iAdd);
1825 
1826             E6l = _mm_sub_epi32(EE1l, E1l);
1827             E6l = _mm_add_epi32(E6l, m128iAdd);
1828 
1829             E7l = _mm_sub_epi32(EE0l, E0l);
1830             E7l = _mm_add_epi32(E7l, m128iAdd);
1831 
1832             E4h = _mm_sub_epi32(EE3h, E3h);
1833             E4h = _mm_add_epi32(E4h, m128iAdd);
1834 
1835             E5h = _mm_sub_epi32(EE2h, E2h);
1836             E5h = _mm_add_epi32(E5h, m128iAdd);
1837 
1838             E6h = _mm_sub_epi32(EE1h, E1h);
1839             E6h = _mm_add_epi32(E6h, m128iAdd);
1840 
1841             E7h = _mm_sub_epi32(EE0h, E0h);
1842             E7h = _mm_add_epi32(E7h, m128iAdd);
1843 
1844             E0l = _mm_add_epi32(EE0l, E0l);
1845             E0l = _mm_add_epi32(E0l, m128iAdd);
1846 
1847             E1l = _mm_add_epi32(EE1l, E1l);
1848             E1l = _mm_add_epi32(E1l, m128iAdd);
1849 
1850             E2l = _mm_add_epi32(EE2l, E2l);
1851             E2l = _mm_add_epi32(E2l, m128iAdd);
1852 
1853             E3l = _mm_add_epi32(EE3l, E3l);
1854             E3l = _mm_add_epi32(E3l, m128iAdd);
1855 
1856             E0h = _mm_add_epi32(EE0h, E0h);
1857             E0h = _mm_add_epi32(E0h, m128iAdd);
1858 
1859             E1h = _mm_add_epi32(EE1h, E1h);
1860             E1h = _mm_add_epi32(E1h, m128iAdd);
1861 
1862             E2h = _mm_add_epi32(EE2h, E2h);
1863             E2h = _mm_add_epi32(E2h, m128iAdd);
1864 
1865             E3h = _mm_add_epi32(EE3h, E3h);
1866             E3h = _mm_add_epi32(E3h, m128iAdd);
1867 
1868             m128iS0 = _mm_packs_epi32(
1869                     _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
1870                     _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
1871             m128iS1 = _mm_packs_epi32(
1872                     _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
1873                     _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
1874             m128iS2 = _mm_packs_epi32(
1875                     _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
1876                     _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
1877             m128iS3 = _mm_packs_epi32(
1878                     _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
1879                     _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
1880 
1881             m128iS4 = _mm_packs_epi32(
1882                     _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
1883                     _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
1884             m128iS5 = _mm_packs_epi32(
1885                     _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
1886                     _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
1887             m128iS6 = _mm_packs_epi32(
1888                     _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
1889                     _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
1890             m128iS7 = _mm_packs_epi32(
1891                     _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
1892                     _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
1893 
1894             m128iS15 = _mm_packs_epi32(
1895                     _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
1896                     _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
1897             m128iS14 = _mm_packs_epi32(
1898                     _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
1899                     _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
1900             m128iS13 = _mm_packs_epi32(
1901                     _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
1902                     _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
1903             m128iS12 = _mm_packs_epi32(
1904                     _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
1905                     _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
1906 
1907             m128iS11 = _mm_packs_epi32(
1908                     _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
1909                     _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
1910             m128iS10 = _mm_packs_epi32(
1911                     _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
1912                     _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
1913             m128iS9 = _mm_packs_epi32(
1914                     _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
1915                     _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
1916             m128iS8 = _mm_packs_epi32(
1917                     _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
1918                     _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
1919 
1920 
1921 
1922             if (!j) { //first pass
1923 
1924                 /*      Inverse the matrix      */
1925                 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
1926                 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
1927                 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
1928                 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
1929                 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
1930                 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
1931                 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
1932                 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
1933 
1934                 E0h = _mm_unpackhi_epi16(m128iS0, m128iS8);
1935                 E1h = _mm_unpackhi_epi16(m128iS1, m128iS9);
1936                 E2h = _mm_unpackhi_epi16(m128iS2, m128iS10);
1937                 E3h = _mm_unpackhi_epi16(m128iS3, m128iS11);
1938                 E4h = _mm_unpackhi_epi16(m128iS4, m128iS12);
1939                 E5h = _mm_unpackhi_epi16(m128iS5, m128iS13);
1940                 E6h = _mm_unpackhi_epi16(m128iS6, m128iS14);
1941                 E7h = _mm_unpackhi_epi16(m128iS7, m128iS15);
1942 
1943                 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
1944                 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
1945                 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
1946                 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
1947 
1948                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1949                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1950                 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1951                 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1952 
1953                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1954                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1955                 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1956                 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1957 
1958                 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
1959                 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
1960                 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
1961                 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
1962 
1963                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1964                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1965                 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1966                 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1967 
1968                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1969                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1970                 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1971                 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1972 
1973                 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
1974                 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
1975                 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
1976                 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
1977 
1978                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1979                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1980                 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1981                 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1982 
1983                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1984                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
1985                 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1986                 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1987 
1988                 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
1989                 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
1990                 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
1991                 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
1992 
1993                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
1994                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
1995                 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
1996                 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
1997 
1998                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
1999                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2000                 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2001                 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2002 
2003                 if (!i) {
2004 
2005                     r0= m128iS0;    //0
2006                     r1= m128iS1;    //16
2007                     r2= m128iS2;    //32
2008                     r3= m128iS3;    //48
2009                     r4= m128iS4;    //64
2010                     r5= m128iS5;    //80
2011                     r6= m128iS6;    //96
2012                     r7= m128iS7;    //112
2013                     r8= m128iS8;    //128
2014                     r9= m128iS9;    //144
2015                     r10= m128iS10;  //160
2016                     r11= m128iS11;  //176
2017                     r12= m128iS12;  //192
2018                     r13= m128iS13;  //208
2019                     r14= m128iS14;  //224
2020                     r15= m128iS15;  //240
2021 
2022 
2023 
2024                     m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2025                     m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2026                     m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2027                     m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2028                     m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2029                     m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2030                     m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2031                     m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2032                     m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2033                     m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2034                     m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2035                     m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2036                     m128iS12 = _mm_load_si128((__m128i *) (src + 200));
2037                     m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2038                     m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2039                     m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2040                 } else {
2041 
2042                     r16= m128iS0;    //8
2043                     r17= m128iS1;    //24
2044                     r18= m128iS2;    //40
2045                     r19= m128iS3;    //56
2046                     r20= m128iS4;    //72
2047                     r21= m128iS5;    //88
2048                     r22= m128iS6;    //104
2049                     r23= m128iS7;    //120
2050                     r24= m128iS8;    //136
2051                     r25= m128iS9;    //152
2052                     r26= m128iS10;  //168
2053                     r27= m128iS11;  //184
2054                     r28= m128iS12;  //200
2055                     r29= m128iS13;  //216
2056                     r30= m128iS14;  //232
2057                     r31= m128iS15;  //248
2058 
2059                     //prepare next iteration :
2060 
2061                     m128iS0= r0;
2062                     m128iS1= r2;
2063                     m128iS2= r4;
2064                     m128iS3= r6;
2065                     m128iS4= r8;
2066                     m128iS5= r10;
2067                     m128iS6= r12;
2068                     m128iS7= r14;
2069                     m128iS8= r16;
2070                     m128iS9= r18;
2071                     m128iS10=r20;
2072                     m128iS11=r22;
2073                     m128iS12=r24;
2074                     m128iS13=r26;
2075                     m128iS14=r28;
2076                     m128iS15=r30;
2077 
2078                     shift = shift_2nd;
2079                     m128iAdd = _mm_set1_epi32(add_2nd);
2080                 }
2081 
2082             } else {
2083 
2084                 //transpose half matrix :
2085                 //instead of having 1 register = 1 half-column,
2086                 //1 register = 1 half-row.
2087                 E0l = _mm_unpacklo_epi16(m128iS0, m128iS1);
2088                 E1l = _mm_unpacklo_epi16(m128iS2, m128iS3);
2089                 E2l = _mm_unpacklo_epi16(m128iS4, m128iS5);
2090                 E3l = _mm_unpacklo_epi16(m128iS6, m128iS7);
2091                 E4l = _mm_unpacklo_epi16(m128iS8, m128iS9);
2092                 E5l = _mm_unpacklo_epi16(m128iS10, m128iS11);
2093                 E6l = _mm_unpacklo_epi16(m128iS12, m128iS13);
2094                 E7l = _mm_unpacklo_epi16(m128iS14, m128iS15);
2095 
2096                 O0l = _mm_unpackhi_epi16(m128iS0, m128iS1);
2097                 O1l = _mm_unpackhi_epi16(m128iS2, m128iS3);
2098                 O2l = _mm_unpackhi_epi16(m128iS4, m128iS5);
2099                 O3l = _mm_unpackhi_epi16(m128iS6, m128iS7);
2100                 O4l = _mm_unpackhi_epi16(m128iS8, m128iS9);
2101                 O5l = _mm_unpackhi_epi16(m128iS10, m128iS11);
2102                 O6l = _mm_unpackhi_epi16(m128iS12, m128iS13);
2103                 O7l = _mm_unpackhi_epi16(m128iS14, m128iS15);
2104 
2105 
2106                 m128Tmp0 = _mm_unpacklo_epi32(E0l, E1l);
2107                 m128Tmp1 = _mm_unpacklo_epi32(E2l, E3l);
2108 
2109                 m128Tmp2 = _mm_unpacklo_epi32(E4l, E5l);
2110                 m128Tmp3 = _mm_unpacklo_epi32(E6l, E7l);
2111 
2112                 r0 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);    //1st half 1st row
2113                 r2 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);    //2nd half 1st row
2114 
2115 
2116                 r4 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);    //1st half 2nd row
2117                 r6 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);    //2nd half 2nd row
2118 
2119                 m128Tmp0 = _mm_unpackhi_epi32(E0l, E1l);
2120                 m128Tmp1 = _mm_unpackhi_epi32(E2l, E3l);
2121                 m128Tmp2 = _mm_unpackhi_epi32(E4l, E5l);
2122                 m128Tmp3 = _mm_unpackhi_epi32(E6l, E7l);
2123 
2124 
2125                 r8 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2126                 r10 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2127 
2128                 r12 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2129                 r14 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2130 
2131                 m128Tmp0 = _mm_unpacklo_epi32(O0l, O1l);
2132                 m128Tmp1 = _mm_unpacklo_epi32(O2l, O3l);
2133                 m128Tmp2 = _mm_unpacklo_epi32(O4l, O5l);
2134                 m128Tmp3 = _mm_unpacklo_epi32(O6l, O7l);
2135 
2136                 r16 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2137                 r18 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2138 
2139 
2140                 r20 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2141                 r22 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2142 
2143                 m128Tmp0 = _mm_unpackhi_epi32(O0l, O1l);
2144                 m128Tmp1 = _mm_unpackhi_epi32(O2l, O3l);
2145                 m128Tmp2 = _mm_unpackhi_epi32(O4l, O5l);
2146                 m128Tmp3 = _mm_unpackhi_epi32(O6l, O7l);
2147 
2148                 r24 = _mm_unpacklo_epi64(m128Tmp0, m128Tmp1);
2149                 r26 = _mm_unpacklo_epi64(m128Tmp2, m128Tmp3);
2150 
2151 
2152                 r28 = _mm_unpackhi_epi64(m128Tmp0, m128Tmp1);
2153                 r30 = _mm_unpackhi_epi64(m128Tmp2, m128Tmp3);
2154 
2155                 dst = (uint8_t*) (_dst + (i*stride));
2156                 m128Tmp0= _mm_setzero_si128();
2157                 m128Tmp1= _mm_load_si128((__m128i*)dst);
2158                 m128Tmp2= _mm_load_si128((__m128i*)(dst+stride));
2159                 m128Tmp3= _mm_load_si128((__m128i*)(dst+2*stride));
2160                 m128Tmp4= _mm_load_si128((__m128i*)(dst+3*stride));
2161                 m128Tmp5= _mm_load_si128((__m128i*)(dst+4*stride));
2162                 m128Tmp6= _mm_load_si128((__m128i*)(dst+5*stride));
2163                 m128Tmp7= _mm_load_si128((__m128i*)(dst+6*stride));
2164                 E0l= _mm_load_si128((__m128i*)(dst+7*stride));
2165 
2166 
2167                 r0= _mm_adds_epi16(r0,_mm_unpacklo_epi8(m128Tmp1,m128Tmp0));
2168                 r2= _mm_adds_epi16(r2,_mm_unpackhi_epi8(m128Tmp1,m128Tmp0));
2169                 r0= _mm_packus_epi16(r0,r2);
2170 
2171 
2172 
2173 
2174                 r4= _mm_adds_epi16(r4,_mm_unpacklo_epi8(m128Tmp2,m128Tmp0));
2175                 r6= _mm_adds_epi16(r6,_mm_unpackhi_epi8(m128Tmp2,m128Tmp0));
2176                 r4= _mm_packus_epi16(r4,r6);
2177 
2178 
2179                 r8= _mm_adds_epi16(r8,_mm_unpacklo_epi8(m128Tmp3,m128Tmp0));
2180                 r10= _mm_adds_epi16(r10,_mm_unpackhi_epi8(m128Tmp3,m128Tmp0));
2181                 r8= _mm_packus_epi16(r8,r10);
2182 
2183 
2184                 r12= _mm_adds_epi16(r12,_mm_unpacklo_epi8(m128Tmp4,m128Tmp0));
2185                 r14= _mm_adds_epi16(r14,_mm_unpackhi_epi8(m128Tmp4,m128Tmp0));
2186                 r12= _mm_packus_epi16(r12,r14);
2187 
2188 
2189                 r16= _mm_adds_epi16(r16,_mm_unpacklo_epi8(m128Tmp5,m128Tmp0));
2190                 r18= _mm_adds_epi16(r18,_mm_unpackhi_epi8(m128Tmp5,m128Tmp0));
2191                 r16= _mm_packus_epi16(r16,r18);
2192 
2193 
2194                 r20= _mm_adds_epi16(r20,_mm_unpacklo_epi8(m128Tmp6,m128Tmp0));
2195                 r22= _mm_adds_epi16(r22,_mm_unpackhi_epi8(m128Tmp6,m128Tmp0));
2196                 r20= _mm_packus_epi16(r20,r22);
2197 
2198 
2199                 r24= _mm_adds_epi16(r24,_mm_unpacklo_epi8(m128Tmp7,m128Tmp0));
2200                 r26= _mm_adds_epi16(r26,_mm_unpackhi_epi8(m128Tmp7,m128Tmp0));
2201                 r24= _mm_packus_epi16(r24,r26);
2202 
2203 
2204 
2205                 r28= _mm_adds_epi16(r28,_mm_unpacklo_epi8(E0l,m128Tmp0));
2206                 r30= _mm_adds_epi16(r30,_mm_unpackhi_epi8(E0l,m128Tmp0));
2207                 r28= _mm_packus_epi16(r28,r30);
2208 
2209                 _mm_store_si128((__m128i*)dst,r0);
2210                 _mm_store_si128((__m128i*)(dst+stride),r4);
2211                 _mm_store_si128((__m128i*)(dst+2*stride),r8);
2212                 _mm_store_si128((__m128i*)(dst+3*stride),r12);
2213                 _mm_store_si128((__m128i*)(dst+4*stride),r16);
2214                 _mm_store_si128((__m128i*)(dst+5*stride),r20);
2215                 _mm_store_si128((__m128i*)(dst+6*stride),r24);
2216                 _mm_store_si128((__m128i*)(dst+7*stride),r28);
2217 
2218 
2219 
2220                 if (!i) {
2221                     //first half done: keep the remaining transposed rows (odd r registers) as input for the next iteration
2222 
2223 
2224                     m128iS0= r1;
2225                     m128iS1= r3;
2226                     m128iS2= r5;
2227                     m128iS3= r7;
2228                     m128iS4= r9;
2229                     m128iS5= r11;
2230                     m128iS6= r13;
2231                     m128iS7= r15;
2232                     m128iS8= r17;
2233                     m128iS9= r19;
2234                     m128iS10=r21;
2235                     m128iS11=r23;
2236                     m128iS12=r25;
2237                     m128iS13=r27;
2238                     m128iS14=r29;
2239                     m128iS15=r31;
2240                 }
2241             }
2242         }
2243     }
2244 }
2245 #endif
2246 
2247 
2248 #if 0
2249 void ff_hevc_transform_16x16_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
2250         ptrdiff_t _stride) {
2251     int i;
2252     uint16_t *dst = (uint16_t*) _dst;
2253     ptrdiff_t stride = _stride / 2;
2254     int16_t *src = coeffs;
2255     int32_t shift;
2256     uint8_t shift_2nd = 10; //20 - bit depth
2257     uint16_t add_2nd = 1 << 9; //shift - 1;
2258     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2259             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2260             m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2261             m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2262             E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2263             O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2264             E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2265     __m128i E4l, E5l, E6l, E7l;
2266     __m128i E4h, E5h, E6h, E7h;
2267     int j;
2268     m128iS0 = _mm_load_si128((__m128i *) (src));
2269     m128iS1 = _mm_load_si128((__m128i *) (src + 16));
2270     m128iS2 = _mm_load_si128((__m128i *) (src + 32));
2271     m128iS3 = _mm_load_si128((__m128i *) (src + 48));
2272     m128iS4 = _mm_loadu_si128((__m128i *) (src + 64));
2273     m128iS5 = _mm_load_si128((__m128i *) (src + 80));
2274     m128iS6 = _mm_load_si128((__m128i *) (src + 96));
2275     m128iS7 = _mm_load_si128((__m128i *) (src + 112));
2276     m128iS8 = _mm_load_si128((__m128i *) (src + 128));
2277     m128iS9 = _mm_load_si128((__m128i *) (src + 144));
2278     m128iS10 = _mm_load_si128((__m128i *) (src + 160));
2279     m128iS11 = _mm_load_si128((__m128i *) (src + 176));
2280     m128iS12 = _mm_loadu_si128((__m128i *) (src + 192));
2281     m128iS13 = _mm_load_si128((__m128i *) (src + 208));
2282     m128iS14 = _mm_load_si128((__m128i *) (src + 224));
2283     m128iS15 = _mm_load_si128((__m128i *) (src + 240));
2284     shift = shift_1st;
2285     m128iAdd = _mm_set1_epi32(add_1st);
2286 
2287     for (j = 0; j < 2; j++) {
2288         for (i = 0; i < 16; i += 8) {
2289 
2290             m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2291             E0l = _mm_madd_epi16(m128Tmp0,
2292                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2293             m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2294             E0h = _mm_madd_epi16(m128Tmp1,
2295                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
2296 
2297             m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2298             E1l = _mm_madd_epi16(m128Tmp2,
2299                     _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2300             m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2301             E1h = _mm_madd_epi16(m128Tmp3,
2302                     _mm_load_si128((__m128i *) (transform16x16_1[1][0])));
2303 
2304             m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2305             E2l = _mm_madd_epi16(m128Tmp4,
2306                     _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2307             m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
2308             E2h = _mm_madd_epi16(m128Tmp5,
2309                     _mm_load_si128((__m128i *) (transform16x16_1[2][0])));
2310 
2311             m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
2312             E3l = _mm_madd_epi16(m128Tmp6,
2313                     _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2314             m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
2315             E3h = _mm_madd_epi16(m128Tmp7,
2316                     _mm_load_si128((__m128i *) (transform16x16_1[3][0])));
2317 
2318             O0l = _mm_add_epi32(E0l, E1l);
2319             O0l = _mm_add_epi32(O0l, E2l);
2320             O0l = _mm_add_epi32(O0l, E3l);
2321 
2322             O0h = _mm_add_epi32(E0h, E1h);
2323             O0h = _mm_add_epi32(O0h, E2h);
2324             O0h = _mm_add_epi32(O0h, E3h);
2325 
2326             /* Compute O1*/
2327             E0l = _mm_madd_epi16(m128Tmp0,
2328                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2329             E0h = _mm_madd_epi16(m128Tmp1,
2330                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
2331             E1l = _mm_madd_epi16(m128Tmp2,
2332                     _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2333             E1h = _mm_madd_epi16(m128Tmp3,
2334                     _mm_load_si128((__m128i *) (transform16x16_1[1][1])));
2335             E2l = _mm_madd_epi16(m128Tmp4,
2336                     _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2337             E2h = _mm_madd_epi16(m128Tmp5,
2338                     _mm_load_si128((__m128i *) (transform16x16_1[2][1])));
2339             E3l = _mm_madd_epi16(m128Tmp6,
2340                     _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2341             E3h = _mm_madd_epi16(m128Tmp7,
2342                     _mm_load_si128((__m128i *) (transform16x16_1[3][1])));
2343             O1l = _mm_add_epi32(E0l, E1l);
2344             O1l = _mm_add_epi32(O1l, E2l);
2345             O1l = _mm_add_epi32(O1l, E3l);
2346             O1h = _mm_add_epi32(E0h, E1h);
2347             O1h = _mm_add_epi32(O1h, E2h);
2348             O1h = _mm_add_epi32(O1h, E3h);
2349 
2350             /* Compute O2*/
2351             E0l = _mm_madd_epi16(m128Tmp0,
2352                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2353             E0h = _mm_madd_epi16(m128Tmp1,
2354                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
2355             E1l = _mm_madd_epi16(m128Tmp2,
2356                     _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2357             E1h = _mm_madd_epi16(m128Tmp3,
2358                     _mm_load_si128((__m128i *) (transform16x16_1[1][2])));
2359             E2l = _mm_madd_epi16(m128Tmp4,
2360                     _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2361             E2h = _mm_madd_epi16(m128Tmp5,
2362                     _mm_load_si128((__m128i *) (transform16x16_1[2][2])));
2363             E3l = _mm_madd_epi16(m128Tmp6,
2364                     _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2365             E3h = _mm_madd_epi16(m128Tmp7,
2366                     _mm_load_si128((__m128i *) (transform16x16_1[3][2])));
2367             O2l = _mm_add_epi32(E0l, E1l);
2368             O2l = _mm_add_epi32(O2l, E2l);
2369             O2l = _mm_add_epi32(O2l, E3l);
2370 
2371             O2h = _mm_add_epi32(E0h, E1h);
2372             O2h = _mm_add_epi32(O2h, E2h);
2373             O2h = _mm_add_epi32(O2h, E3h);
2374 
2375             /* Compute O3*/
2376             E0l = _mm_madd_epi16(m128Tmp0,
2377                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2378             E0h = _mm_madd_epi16(m128Tmp1,
2379                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
2380             E1l = _mm_madd_epi16(m128Tmp2,
2381                     _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2382             E1h = _mm_madd_epi16(m128Tmp3,
2383                     _mm_load_si128((__m128i *) (transform16x16_1[1][3])));
2384             E2l = _mm_madd_epi16(m128Tmp4,
2385                     _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2386             E2h = _mm_madd_epi16(m128Tmp5,
2387                     _mm_load_si128((__m128i *) (transform16x16_1[2][3])));
2388             E3l = _mm_madd_epi16(m128Tmp6,
2389                     _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2390             E3h = _mm_madd_epi16(m128Tmp7,
2391                     _mm_load_si128((__m128i *) (transform16x16_1[3][3])));
2392 
2393             O3l = _mm_add_epi32(E0l, E1l);
2394             O3l = _mm_add_epi32(O3l, E2l);
2395             O3l = _mm_add_epi32(O3l, E3l);
2396 
2397             O3h = _mm_add_epi32(E0h, E1h);
2398             O3h = _mm_add_epi32(O3h, E2h);
2399             O3h = _mm_add_epi32(O3h, E3h);
2400 
2401             /* Compute O4*/
2402 
2403             E0l = _mm_madd_epi16(m128Tmp0,
2404                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2405             E0h = _mm_madd_epi16(m128Tmp1,
2406                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
2407             E1l = _mm_madd_epi16(m128Tmp2,
2408                     _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2409             E1h = _mm_madd_epi16(m128Tmp3,
2410                     _mm_load_si128((__m128i *) (transform16x16_1[1][4])));
2411             E2l = _mm_madd_epi16(m128Tmp4,
2412                     _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2413             E2h = _mm_madd_epi16(m128Tmp5,
2414                     _mm_load_si128((__m128i *) (transform16x16_1[2][4])));
2415             E3l = _mm_madd_epi16(m128Tmp6,
2416                     _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2417             E3h = _mm_madd_epi16(m128Tmp7,
2418                     _mm_load_si128((__m128i *) (transform16x16_1[3][4])));
2419 
2420             O4l = _mm_add_epi32(E0l, E1l);
2421             O4l = _mm_add_epi32(O4l, E2l);
2422             O4l = _mm_add_epi32(O4l, E3l);
2423 
2424             O4h = _mm_add_epi32(E0h, E1h);
2425             O4h = _mm_add_epi32(O4h, E2h);
2426             O4h = _mm_add_epi32(O4h, E3h);
2427 
2428             /* Compute O5*/
2429             E0l = _mm_madd_epi16(m128Tmp0,
2430                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2431             E0h = _mm_madd_epi16(m128Tmp1,
2432                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
2433             E1l = _mm_madd_epi16(m128Tmp2,
2434                     _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2435             E1h = _mm_madd_epi16(m128Tmp3,
2436                     _mm_load_si128((__m128i *) (transform16x16_1[1][5])));
2437             E2l = _mm_madd_epi16(m128Tmp4,
2438                     _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2439             E2h = _mm_madd_epi16(m128Tmp5,
2440                     _mm_load_si128((__m128i *) (transform16x16_1[2][5])));
2441             E3l = _mm_madd_epi16(m128Tmp6,
2442                     _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2443             E3h = _mm_madd_epi16(m128Tmp7,
2444                     _mm_load_si128((__m128i *) (transform16x16_1[3][5])));
2445 
2446             O5l = _mm_add_epi32(E0l, E1l);
2447             O5l = _mm_add_epi32(O5l, E2l);
2448             O5l = _mm_add_epi32(O5l, E3l);
2449 
2450             O5h = _mm_add_epi32(E0h, E1h);
2451             O5h = _mm_add_epi32(O5h, E2h);
2452             O5h = _mm_add_epi32(O5h, E3h);
2453 
2454             /* Compute O6*/
2455 
2456             E0l = _mm_madd_epi16(m128Tmp0,
2457                     _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2458             E0h = _mm_madd_epi16(m128Tmp1,
2459                     _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
2460             E1l = _mm_madd_epi16(m128Tmp2,
2461                     _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2462             E1h = _mm_madd_epi16(m128Tmp3,
2463                     _mm_load_si128((__m128i *) (transform16x16_1[1][6])));
2464             E2l = _mm_madd_epi16(m128Tmp4,
2465                     _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2466             E2h = _mm_madd_epi16(m128Tmp5,
2467                     _mm_load_si128((__m128i *) (transform16x16_1[2][6])));
2468             E3l = _mm_madd_epi16(m128Tmp6,
2469                     _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2470             E3h = _mm_madd_epi16(m128Tmp7,
2471                     _mm_load_si128((__m128i *) (transform16x16_1[3][6])));
2472 
2473             O6l = _mm_add_epi32(E0l, E1l);
2474             O6l = _mm_add_epi32(O6l, E2l);
2475             O6l = _mm_add_epi32(O6l, E3l);
2476 
2477             O6h = _mm_add_epi32(E0h, E1h);
2478             O6h = _mm_add_epi32(O6h, E2h);
2479             O6h = _mm_add_epi32(O6h, E3h);
2480 
2481             /* Compute O7*/
2482 
2483             E0l = _mm_madd_epi16(m128Tmp0,
2484                     _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2485             E0h = _mm_madd_epi16(m128Tmp1,
2486                     _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
2487             E1l = _mm_madd_epi16(m128Tmp2,
2488                     _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2489             E1h = _mm_madd_epi16(m128Tmp3,
2490                     _mm_load_si128((__m128i *) (transform16x16_1[1][7])));
2491             E2l = _mm_madd_epi16(m128Tmp4,
2492                     _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2493             E2h = _mm_madd_epi16(m128Tmp5,
2494                     _mm_load_si128((__m128i *) (transform16x16_1[2][7])));
2495             E3l = _mm_madd_epi16(m128Tmp6,
2496                     _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2497             E3h = _mm_madd_epi16(m128Tmp7,
2498                     _mm_load_si128((__m128i *) (transform16x16_1[3][7])));
2499 
2500             O7l = _mm_add_epi32(E0l, E1l);
2501             O7l = _mm_add_epi32(O7l, E2l);
2502             O7l = _mm_add_epi32(O7l, E3l);
2503 
2504             O7h = _mm_add_epi32(E0h, E1h);
2505             O7h = _mm_add_epi32(O7h, E2h);
2506             O7h = _mm_add_epi32(O7h, E3h);
2507 
2508             /*  Compute E0  */
2509 
2510             m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
2511             E0l = _mm_madd_epi16(m128Tmp0,
2512                     _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2513             m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
2514             E0h = _mm_madd_epi16(m128Tmp1,
2515                     _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
2516 
2517             m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
2518             E0l = _mm_add_epi32(E0l,
2519                     _mm_madd_epi16(m128Tmp2,
2520                             _mm_load_si128(
2521                                     (__m128i *) (transform16x16_2[1][0]))));
2522             m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
2523             E0h = _mm_add_epi32(E0h,
2524                     _mm_madd_epi16(m128Tmp3,
2525                             _mm_load_si128(
2526                                     (__m128i *) (transform16x16_2[1][0]))));
2527 
2528             /*  Compute E1  */
2529             E1l = _mm_madd_epi16(m128Tmp0,
2530                     _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2531             E1h = _mm_madd_epi16(m128Tmp1,
2532                     _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
2533             E1l = _mm_add_epi32(E1l,
2534                     _mm_madd_epi16(m128Tmp2,
2535                             _mm_load_si128(
2536                                     (__m128i *) (transform16x16_2[1][1]))));
2537             E1h = _mm_add_epi32(E1h,
2538                     _mm_madd_epi16(m128Tmp3,
2539                             _mm_load_si128(
2540                                     (__m128i *) (transform16x16_2[1][1]))));
2541 
2542             /*  Compute E2  */
2543             E2l = _mm_madd_epi16(m128Tmp0,
2544                     _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2545             E2h = _mm_madd_epi16(m128Tmp1,
2546                     _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
2547             E2l = _mm_add_epi32(E2l,
2548                     _mm_madd_epi16(m128Tmp2,
2549                             _mm_load_si128(
2550                                     (__m128i *) (transform16x16_2[1][2]))));
2551             E2h = _mm_add_epi32(E2h,
2552                     _mm_madd_epi16(m128Tmp3,
2553                             _mm_load_si128(
2554                                     (__m128i *) (transform16x16_2[1][2]))));
2555             /*  Compute E3  */
2556             E3l = _mm_madd_epi16(m128Tmp0,
2557                     _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2558             E3h = _mm_madd_epi16(m128Tmp1,
2559                     _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
2560             E3l = _mm_add_epi32(E3l,
2561                     _mm_madd_epi16(m128Tmp2,
2562                             _mm_load_si128(
2563                                     (__m128i *) (transform16x16_2[1][3]))));
2564             E3h = _mm_add_epi32(E3h,
2565                     _mm_madd_epi16(m128Tmp3,
2566                             _mm_load_si128(
2567                                     (__m128i *) (transform16x16_2[1][3]))));
2568 
2569             /*  Compute EE0 and EEE */
2570 
2571             m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
2572             E00l = _mm_madd_epi16(m128Tmp0,
2573                     _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2574             m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
2575             E00h = _mm_madd_epi16(m128Tmp1,
2576                     _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
2577 
2578             m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS8);
2579             EE0l = _mm_madd_epi16(m128Tmp2,
2580                     _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2581             m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS8);
2582             EE0h = _mm_madd_epi16(m128Tmp3,
2583                     _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
2584 
2585             E01l = _mm_madd_epi16(m128Tmp0,
2586                     _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2587             E01h = _mm_madd_epi16(m128Tmp1,
2588                     _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
2589 
2590             EE1l = _mm_madd_epi16(m128Tmp2,
2591                     _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2592             EE1h = _mm_madd_epi16(m128Tmp3,
2593                     _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
2594 
2595             /*  Compute EE    */
2596             EE2l = _mm_sub_epi32(EE1l, E01l);
2597             EE3l = _mm_sub_epi32(EE0l, E00l);
2598             EE2h = _mm_sub_epi32(EE1h, E01h);
2599             EE3h = _mm_sub_epi32(EE0h, E00h);
2600 
2601             EE0l = _mm_add_epi32(EE0l, E00l);
2602             EE1l = _mm_add_epi32(EE1l, E01l);
2603             EE0h = _mm_add_epi32(EE0h, E00h);
2604             EE1h = _mm_add_epi32(EE1h, E01h);
2605 
2606             /*      Compute E       */
2607 
2608             E4l = _mm_sub_epi32(EE3l, E3l);
2609             E4l = _mm_add_epi32(E4l, m128iAdd);
2610 
2611             E5l = _mm_sub_epi32(EE2l, E2l);
2612             E5l = _mm_add_epi32(E5l, m128iAdd);
2613 
2614             E6l = _mm_sub_epi32(EE1l, E1l);
2615             E6l = _mm_add_epi32(E6l, m128iAdd);
2616 
2617             E7l = _mm_sub_epi32(EE0l, E0l);
2618             E7l = _mm_add_epi32(E7l, m128iAdd);
2619 
2620             E4h = _mm_sub_epi32(EE3h, E3h);
2621             E4h = _mm_add_epi32(E4h, m128iAdd);
2622 
2623             E5h = _mm_sub_epi32(EE2h, E2h);
2624             E5h = _mm_add_epi32(E5h, m128iAdd);
2625 
2626             E6h = _mm_sub_epi32(EE1h, E1h);
2627             E6h = _mm_add_epi32(E6h, m128iAdd);
2628 
2629             E7h = _mm_sub_epi32(EE0h, E0h);
2630             E7h = _mm_add_epi32(E7h, m128iAdd);
2631 
2632             E0l = _mm_add_epi32(EE0l, E0l);
2633             E0l = _mm_add_epi32(E0l, m128iAdd);
2634 
2635             E1l = _mm_add_epi32(EE1l, E1l);
2636             E1l = _mm_add_epi32(E1l, m128iAdd);
2637 
2638             E2l = _mm_add_epi32(EE2l, E2l);
2639             E2l = _mm_add_epi32(E2l, m128iAdd);
2640 
2641             E3l = _mm_add_epi32(EE3l, E3l);
2642             E3l = _mm_add_epi32(E3l, m128iAdd);
2643 
2644             E0h = _mm_add_epi32(EE0h, E0h);
2645             E0h = _mm_add_epi32(E0h, m128iAdd);
2646 
2647             E1h = _mm_add_epi32(EE1h, E1h);
2648             E1h = _mm_add_epi32(E1h, m128iAdd);
2649 
2650             E2h = _mm_add_epi32(EE2h, E2h);
2651             E2h = _mm_add_epi32(E2h, m128iAdd);
2652 
2653             E3h = _mm_add_epi32(EE3h, E3h);
2654             E3h = _mm_add_epi32(E3h, m128iAdd);
2655 
2656             m128iS0 = _mm_packs_epi32(
2657                     _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
2658                     _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
2659             m128iS1 = _mm_packs_epi32(
2660                     _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
2661                     _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
2662             m128iS2 = _mm_packs_epi32(
2663                     _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
2664                     _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
2665             m128iS3 = _mm_packs_epi32(
2666                     _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
2667                     _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
2668 
2669             m128iS4 = _mm_packs_epi32(
2670                     _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
2671                     _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
2672             m128iS5 = _mm_packs_epi32(
2673                     _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
2674                     _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
2675             m128iS6 = _mm_packs_epi32(
2676                     _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
2677                     _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
2678             m128iS7 = _mm_packs_epi32(
2679                     _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
2680                     _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
2681 
2682             m128iS15 = _mm_packs_epi32(
2683                     _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
2684                     _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
2685             m128iS14 = _mm_packs_epi32(
2686                     _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
2687                     _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
2688             m128iS13 = _mm_packs_epi32(
2689                     _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
2690                     _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
2691             m128iS12 = _mm_packs_epi32(
2692                     _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
2693                     _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
2694 
2695             m128iS11 = _mm_packs_epi32(
2696                     _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
2697                     _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
2698             m128iS10 = _mm_packs_epi32(
2699                     _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
2700                     _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
2701             m128iS9 = _mm_packs_epi32(
2702                     _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
2703                     _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
2704             m128iS8 = _mm_packs_epi32(
2705                     _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
2706                     _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
2707 
2708             if (!j) {
2709                 /*      Inverse the matrix      */
2710                 E0l = _mm_unpacklo_epi16(m128iS0, m128iS8);
2711                 E1l = _mm_unpacklo_epi16(m128iS1, m128iS9);
2712                 E2l = _mm_unpacklo_epi16(m128iS2, m128iS10);
2713                 E3l = _mm_unpacklo_epi16(m128iS3, m128iS11);
2714                 E4l = _mm_unpacklo_epi16(m128iS4, m128iS12);
2715                 E5l = _mm_unpacklo_epi16(m128iS5, m128iS13);
2716                 E6l = _mm_unpacklo_epi16(m128iS6, m128iS14);
2717                 E7l = _mm_unpacklo_epi16(m128iS7, m128iS15);
2718 
2719                 O0l = _mm_unpackhi_epi16(m128iS0, m128iS8);
2720                 O1l = _mm_unpackhi_epi16(m128iS1, m128iS9);
2721                 O2l = _mm_unpackhi_epi16(m128iS2, m128iS10);
2722                 O3l = _mm_unpackhi_epi16(m128iS3, m128iS11);
2723                 O4l = _mm_unpackhi_epi16(m128iS4, m128iS12);
2724                 O5l = _mm_unpackhi_epi16(m128iS5, m128iS13);
2725                 O6l = _mm_unpackhi_epi16(m128iS6, m128iS14);
2726                 O7l = _mm_unpackhi_epi16(m128iS7, m128iS15);
2727 
2728                 m128Tmp0 = _mm_unpacklo_epi16(E0l, E4l);
2729                 m128Tmp1 = _mm_unpacklo_epi16(E1l, E5l);
2730                 m128Tmp2 = _mm_unpacklo_epi16(E2l, E6l);
2731                 m128Tmp3 = _mm_unpacklo_epi16(E3l, E7l);
2732 
2733                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2734                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2735                 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2736                 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2737 
2738                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2739                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2740                 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2741                 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2742 
2743                 m128Tmp0 = _mm_unpackhi_epi16(E0l, E4l);
2744                 m128Tmp1 = _mm_unpackhi_epi16(E1l, E5l);
2745                 m128Tmp2 = _mm_unpackhi_epi16(E2l, E6l);
2746                 m128Tmp3 = _mm_unpackhi_epi16(E3l, E7l);
2747 
2748                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2749                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2750                 m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2751                 m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2752 
2753                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2754                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2755                 m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2756                 m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2757 
2758                 m128Tmp0 = _mm_unpacklo_epi16(O0l, O4l);
2759                 m128Tmp1 = _mm_unpacklo_epi16(O1l, O5l);
2760                 m128Tmp2 = _mm_unpacklo_epi16(O2l, O6l);
2761                 m128Tmp3 = _mm_unpacklo_epi16(O3l, O7l);
2762 
2763                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2764                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2765                 m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2766                 m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2767 
2768                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2769                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2770                 m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2771                 m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2772 
2773                 m128Tmp0 = _mm_unpackhi_epi16(O0l, O4l);
2774                 m128Tmp1 = _mm_unpackhi_epi16(O1l, O5l);
2775                 m128Tmp2 = _mm_unpackhi_epi16(O2l, O6l);
2776                 m128Tmp3 = _mm_unpackhi_epi16(O3l, O7l);
2777 
2778                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
2779                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
2780                 m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2781                 m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2782 
2783                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
2784                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
2785                 m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
2786                 m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
2787 
2788                 /*  */
2789                 _mm_store_si128((__m128i *) (src + i), m128iS0);
2790                 _mm_store_si128((__m128i *) (src + 16 + i), m128iS1);
2791                 _mm_store_si128((__m128i *) (src + 32 + i), m128iS2);
2792                 _mm_store_si128((__m128i *) (src + 48 + i), m128iS3);
2793                 _mm_store_si128((__m128i *) (src + 64 + i), m128iS4);
2794                 _mm_store_si128((__m128i *) (src + 80 + i), m128iS5);
2795                 _mm_store_si128((__m128i *) (src + 96 + i), m128iS6);
2796                 _mm_store_si128((__m128i *) (src + 112 + i), m128iS7);
2797                 _mm_store_si128((__m128i *) (src + 128 + i), m128iS8);
2798                 _mm_store_si128((__m128i *) (src + 144 + i), m128iS9);
2799                 _mm_store_si128((__m128i *) (src + 160 + i), m128iS10);
2800                 _mm_store_si128((__m128i *) (src + 176 + i), m128iS11);
2801                 _mm_store_si128((__m128i *) (src + 192 + i), m128iS12);
2802                 _mm_store_si128((__m128i *) (src + 208 + i), m128iS13);
2803                 _mm_store_si128((__m128i *) (src + 224 + i), m128iS14);
2804                 _mm_store_si128((__m128i *) (src + 240 + i), m128iS15);
2805 
2806                 if (!i) {
2807                     m128iS0 = _mm_load_si128((__m128i *) (src + 8));
2808                     m128iS1 = _mm_load_si128((__m128i *) (src + 24));
2809                     m128iS2 = _mm_load_si128((__m128i *) (src + 40));
2810                     m128iS3 = _mm_load_si128((__m128i *) (src + 56));
2811                     m128iS4 = _mm_loadu_si128((__m128i *) (src + 72));
2812                     m128iS5 = _mm_load_si128((__m128i *) (src + 88));
2813                     m128iS6 = _mm_load_si128((__m128i *) (src + 104));
2814                     m128iS7 = _mm_load_si128((__m128i *) (src + 120));
2815                     m128iS8 = _mm_load_si128((__m128i *) (src + 136));
2816                     m128iS9 = _mm_load_si128((__m128i *) (src + 152));
2817                     m128iS10 = _mm_load_si128((__m128i *) (src + 168));
2818                     m128iS11 = _mm_load_si128((__m128i *) (src + 184));
2819                     m128iS12 = _mm_loadu_si128((__m128i *) (src + 200));
2820                     m128iS13 = _mm_load_si128((__m128i *) (src + 216));
2821                     m128iS14 = _mm_load_si128((__m128i *) (src + 232));
2822                     m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2823                 } else {
2824                     m128iS0 = _mm_load_si128((__m128i *) (src));
2825                     m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2826                     m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2827                     m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2828                     m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2829                     m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2830                     m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2831                     m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2832                     m128iS8 = _mm_load_si128((__m128i *) (src + 8));
2833                     m128iS9 = _mm_load_si128((__m128i *) (src + 32 + 8));
2834                     m128iS10 = _mm_load_si128((__m128i *) (src + 64 + 8));
2835                     m128iS11 = _mm_load_si128((__m128i *) (src + 96 + 8));
2836                     m128iS12 = _mm_loadu_si128((__m128i *) (src + 128 + 8));
2837                     m128iS13 = _mm_load_si128((__m128i *) (src + 160 + 8));
2838                     m128iS14 = _mm_load_si128((__m128i *) (src + 192 + 8));
2839                     m128iS15 = _mm_load_si128((__m128i *) (src + 224 + 8));
2840                     shift = shift_2nd;
2841                     m128iAdd = _mm_set1_epi32(add_2nd);
2842                 }
2843 
2844             } else {
2845                 int k, m = 0;
2846                 _mm_storeu_si128((__m128i *) (src), m128iS0);
2847                 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
2848                 _mm_storeu_si128((__m128i *) (src + 32), m128iS2);
2849                 _mm_storeu_si128((__m128i *) (src + 40), m128iS3);
2850                 _mm_storeu_si128((__m128i *) (src + 64), m128iS4);
2851                 _mm_storeu_si128((__m128i *) (src + 72), m128iS5);
2852                 _mm_storeu_si128((__m128i *) (src + 96), m128iS6);
2853                 _mm_storeu_si128((__m128i *) (src + 104), m128iS7);
2854                 _mm_storeu_si128((__m128i *) (src + 128), m128iS8);
2855                 _mm_storeu_si128((__m128i *) (src + 136), m128iS9);
2856                 _mm_storeu_si128((__m128i *) (src + 160), m128iS10);
2857                 _mm_storeu_si128((__m128i *) (src + 168), m128iS11);
2858                 _mm_storeu_si128((__m128i *) (src + 192), m128iS12);
2859                 _mm_storeu_si128((__m128i *) (src + 200), m128iS13);
2860                 _mm_storeu_si128((__m128i *) (src + 224), m128iS14);
2861                 _mm_storeu_si128((__m128i *) (src + 232), m128iS15);
2862                 dst = (uint16_t*) _dst + (i * stride);
2863 
2864                 for (k = 0; k < 8; k++) {
2865                     dst[0] = av_clip_uintp2(dst[0] + src[m],10);
2866                     dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
2867                     dst[2] = av_clip_uintp2(dst[2] + src[m + 32],10);
2868                     dst[3] = av_clip_uintp2(dst[3] + src[m + 40],10);
2869                     dst[4] = av_clip_uintp2(dst[4] + src[m + 64],10);
2870                     dst[5] = av_clip_uintp2(dst[5] + src[m + 72],10);
2871                     dst[6] = av_clip_uintp2(dst[6] + src[m + 96],10);
2872                     dst[7] = av_clip_uintp2(dst[7] + src[m + 104],10);
2873 
2874                     dst[8] = av_clip_uintp2(dst[8] + src[m + 128],10);
2875                     dst[9] = av_clip_uintp2(dst[9] + src[m + 136],10);
2876                     dst[10] = av_clip_uintp2(dst[10] + src[m + 160],10);
2877                     dst[11] = av_clip_uintp2(dst[11] + src[m + 168],10);
2878                     dst[12] = av_clip_uintp2(dst[12] + src[m + 192],10);
2879                     dst[13] = av_clip_uintp2(dst[13] + src[m + 200],10);
2880                     dst[14] = av_clip_uintp2(dst[14] + src[m + 224],10);
2881                     dst[15] = av_clip_uintp2(dst[15] + src[m + 232],10);
2882                     m += 1;
2883                     dst += stride;
2884                 }
2885                 if (!i) {
2886                     m128iS0 = _mm_load_si128((__m128i *) (src + 16));
2887                     m128iS1 = _mm_load_si128((__m128i *) (src + 48));
2888                     m128iS2 = _mm_load_si128((__m128i *) (src + 80));
2889                     m128iS3 = _mm_loadu_si128((__m128i *) (src + 112));
2890                     m128iS4 = _mm_load_si128((__m128i *) (src + 144));
2891                     m128iS5 = _mm_load_si128((__m128i *) (src + 176));
2892                     m128iS6 = _mm_load_si128((__m128i *) (src + 208));
2893                     m128iS7 = _mm_load_si128((__m128i *) (src + 240));
2894                     m128iS8 = _mm_load_si128((__m128i *) (src + 24));
2895                     m128iS9 = _mm_load_si128((__m128i *) (src + 56));
2896                     m128iS10 = _mm_load_si128((__m128i *) (src + 88));
2897                     m128iS11 = _mm_loadu_si128((__m128i *) (src + 120));
2898                     m128iS12 = _mm_load_si128((__m128i *) (src + 152));
2899                     m128iS13 = _mm_load_si128((__m128i *) (src + 184));
2900                     m128iS14 = _mm_load_si128((__m128i *) (src + 216));
2901                     m128iS15 = _mm_load_si128((__m128i *) (src + 248));
2902                 }
2903             }
2904         }
2905     }
2906 
2907 }
2908 #endif
2909 
2910 
2911 #if HAVE_SSE4_1
2912 void ff_hevc_transform_32x32_add_8_sse4(uint8_t *_dst, const int16_t *coeffs,
2913         ptrdiff_t _stride) {
2914     uint8_t shift_2nd = 12; // 20 - Bit depth
2915     uint16_t add_2nd = 1 << 11; //(1 << (shift_2nd - 1))
2916     int i, j;
2917     uint8_t *dst = (uint8_t*) _dst;
2918     ptrdiff_t stride = _stride / sizeof(uint8_t);
2919     int shift;
2920     const int16_t *src = coeffs;
2921 
2922     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
2923             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
2924             m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
2925             m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
2926             E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
2927             O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
2928             E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
2929     __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
2930     __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
2931             EEE0l, EEE1l, EEE0h, EEE1h;
2932     __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
2933             m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
2934             m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
2935             m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
2936             O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
2937             O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
2938             EE4l, EE7h, EE6h, EE5h, EE4h;
2939 
2940     __m128i r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,r12,r13,r14,r15,r16,r17,r18,r19,r20,r21,r22,r23,r24,r25,r26,r27,r28,r29,r30,r31;
2941     __m128i r32,r33,r34,r35,r36,r37,r38,r39,r40,r41,r42,r43,r44,r45,r46,r47,r48,r49,r50,r51,r52,r53,r54,r55,r56,r57,r58,r59,r60,r61,r62,r63;
2942     __m128i r64,r65,r66,r67,r68,r69,r70,r71,r72,r73,r74,r75,r76,r77,r78,r79,r80,r81,r82,r83,r84,r85,r86,r87,r88,r89,r90,r91,r92,r93,r94,r95;
2943     __m128i r96,r97,r98,r99,r100,r101,r102,r103,r104,r105,r106,r107,r108,r109,r110,r111,r112,r113,r114,r115,r116,r117,r118,r119,r120,r121,r122,r123,r124,r125,r126,r127;
2944 
2945 
2946     m128iS0 = _mm_load_si128((__m128i *) (src));
2947     m128iS1 = _mm_load_si128((__m128i *) (src + 32));
2948     m128iS2 = _mm_load_si128((__m128i *) (src + 64));
2949     m128iS3 = _mm_load_si128((__m128i *) (src + 96));
2950     m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
2951     m128iS5 = _mm_load_si128((__m128i *) (src + 160));
2952     m128iS6 = _mm_load_si128((__m128i *) (src + 192));
2953     m128iS7 = _mm_load_si128((__m128i *) (src + 224));
2954     m128iS8 = _mm_load_si128((__m128i *) (src + 256));
2955     m128iS9 = _mm_load_si128((__m128i *) (src + 288));
2956     m128iS10 = _mm_load_si128((__m128i *) (src + 320));
2957     m128iS11 = _mm_load_si128((__m128i *) (src + 352));
2958     m128iS12 = _mm_load_si128((__m128i *) (src + 384));
2959     m128iS13 = _mm_load_si128((__m128i *) (src + 416));
2960     m128iS14 = _mm_load_si128((__m128i *) (src + 448));
2961     m128iS15 = _mm_load_si128((__m128i *) (src + 480));
2962     m128iS16 = _mm_load_si128((__m128i *) (src + 512));
2963     m128iS17 = _mm_load_si128((__m128i *) (src + 544));
2964     m128iS18 = _mm_load_si128((__m128i *) (src + 576));
2965     m128iS19 = _mm_load_si128((__m128i *) (src + 608));
2966     m128iS20 = _mm_load_si128((__m128i *) (src + 640));
2967     m128iS21 = _mm_load_si128((__m128i *) (src + 672));
2968     m128iS22 = _mm_load_si128((__m128i *) (src + 704));
2969     m128iS23 = _mm_load_si128((__m128i *) (src + 736));
2970     m128iS24 = _mm_load_si128((__m128i *) (src + 768));
2971     m128iS25 = _mm_load_si128((__m128i *) (src + 800));
2972     m128iS26 = _mm_load_si128((__m128i *) (src + 832));
2973     m128iS27 = _mm_load_si128((__m128i *) (src + 864));
2974     m128iS28 = _mm_load_si128((__m128i *) (src + 896));
2975     m128iS29 = _mm_load_si128((__m128i *) (src + 928));
2976     m128iS30 = _mm_load_si128((__m128i *) (src + 960));
2977     m128iS31 = _mm_load_si128((__m128i *) (src + 992));
2978 
2979     shift = shift_1st;
2980     m128iAdd = _mm_set1_epi32(add_1st);
2981 
2982     for (j = 0; j < 2; j++) {
2983         for (i = 0; i < 32; i += 8) {
2984             m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
2985             E0l = _mm_madd_epi16(m128Tmp0,
2986                     _mm_load_si128((__m128i *) (transform32x32[0][0])));
2987             m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
2988             E0h = _mm_madd_epi16(m128Tmp1,
2989                     _mm_load_si128((__m128i *) (transform32x32[0][0])));
2990 
2991             m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
2992             E1l = _mm_madd_epi16(m128Tmp2,
2993                     _mm_load_si128((__m128i *) (transform32x32[1][0])));
2994             m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
2995             E1h = _mm_madd_epi16(m128Tmp3,
2996                     _mm_load_si128((__m128i *) (transform32x32[1][0])));
2997 
2998             m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
2999             E2l = _mm_madd_epi16(m128Tmp4,
3000                     _mm_load_si128((__m128i *) (transform32x32[2][0])));
3001             m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
3002             E2h = _mm_madd_epi16(m128Tmp5,
3003                     _mm_load_si128((__m128i *) (transform32x32[2][0])));
3004 
3005             m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
3006             E3l = _mm_madd_epi16(m128Tmp6,
3007                     _mm_load_si128((__m128i *) (transform32x32[3][0])));
3008             m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
3009             E3h = _mm_madd_epi16(m128Tmp7,
3010                     _mm_load_si128((__m128i *) (transform32x32[3][0])));
3011 
3012             m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
3013             E4l = _mm_madd_epi16(m128Tmp8,
3014                     _mm_load_si128((__m128i *) (transform32x32[4][0])));
3015             m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
3016             E4h = _mm_madd_epi16(m128Tmp9,
3017                     _mm_load_si128((__m128i *) (transform32x32[4][0])));
3018 
3019             m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
3020             E5l = _mm_madd_epi16(m128Tmp10,
3021                     _mm_load_si128((__m128i *) (transform32x32[5][0])));
3022             m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
3023             E5h = _mm_madd_epi16(m128Tmp11,
3024                     _mm_load_si128((__m128i *) (transform32x32[5][0])));
3025 
3026             m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
3027             E6l = _mm_madd_epi16(m128Tmp12,
3028                     _mm_load_si128((__m128i *) (transform32x32[6][0])));
3029             m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
3030             E6h = _mm_madd_epi16(m128Tmp13,
3031                     _mm_load_si128((__m128i *) (transform32x32[6][0])));
3032 
3033             m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
3034             E7l = _mm_madd_epi16(m128Tmp14,
3035                     _mm_load_si128((__m128i *) (transform32x32[7][0])));
3036             m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
3037             E7h = _mm_madd_epi16(m128Tmp15,
3038                     _mm_load_si128((__m128i *) (transform32x32[7][0])));
3039 
3040             O0l = _mm_add_epi32(E0l, E1l);
3041             O0l = _mm_add_epi32(O0l, E2l);
3042             O0l = _mm_add_epi32(O0l, E3l);
3043             O0l = _mm_add_epi32(O0l, E4l);
3044             O0l = _mm_add_epi32(O0l, E5l);
3045             O0l = _mm_add_epi32(O0l, E6l);
3046             O0l = _mm_add_epi32(O0l, E7l);
3047 
3048             O0h = _mm_add_epi32(E0h, E1h);
3049             O0h = _mm_add_epi32(O0h, E2h);
3050             O0h = _mm_add_epi32(O0h, E3h);
3051             O0h = _mm_add_epi32(O0h, E4h);
3052             O0h = _mm_add_epi32(O0h, E5h);
3053             O0h = _mm_add_epi32(O0h, E6h);
3054             O0h = _mm_add_epi32(O0h, E7h);
3055 
3056             /* Compute O1*/
3057             E0l = _mm_madd_epi16(m128Tmp0,
3058                     _mm_load_si128((__m128i *) (transform32x32[0][1])));
3059             E0h = _mm_madd_epi16(m128Tmp1,
3060                     _mm_load_si128((__m128i *) (transform32x32[0][1])));
3061             E1l = _mm_madd_epi16(m128Tmp2,
3062                     _mm_load_si128((__m128i *) (transform32x32[1][1])));
3063             E1h = _mm_madd_epi16(m128Tmp3,
3064                     _mm_load_si128((__m128i *) (transform32x32[1][1])));
3065             E2l = _mm_madd_epi16(m128Tmp4,
3066                     _mm_load_si128((__m128i *) (transform32x32[2][1])));
3067             E2h = _mm_madd_epi16(m128Tmp5,
3068                     _mm_load_si128((__m128i *) (transform32x32[2][1])));
3069             E3l = _mm_madd_epi16(m128Tmp6,
3070                     _mm_load_si128((__m128i *) (transform32x32[3][1])));
3071             E3h = _mm_madd_epi16(m128Tmp7,
3072                     _mm_load_si128((__m128i *) (transform32x32[3][1])));
3073 
3074             E4l = _mm_madd_epi16(m128Tmp8,
3075                     _mm_load_si128((__m128i *) (transform32x32[4][1])));
3076             E4h = _mm_madd_epi16(m128Tmp9,
3077                     _mm_load_si128((__m128i *) (transform32x32[4][1])));
3078             E5l = _mm_madd_epi16(m128Tmp10,
3079                     _mm_load_si128((__m128i *) (transform32x32[5][1])));
3080             E5h = _mm_madd_epi16(m128Tmp11,
3081                     _mm_load_si128((__m128i *) (transform32x32[5][1])));
3082             E6l = _mm_madd_epi16(m128Tmp12,
3083                     _mm_load_si128((__m128i *) (transform32x32[6][1])));
3084             E6h = _mm_madd_epi16(m128Tmp13,
3085                     _mm_load_si128((__m128i *) (transform32x32[6][1])));
3086             E7l = _mm_madd_epi16(m128Tmp14,
3087                     _mm_load_si128((__m128i *) (transform32x32[7][1])));
3088             E7h = _mm_madd_epi16(m128Tmp15,
3089                     _mm_load_si128((__m128i *) (transform32x32[7][1])));
3090 
3091             O1l = _mm_add_epi32(E0l, E1l);
3092             O1l = _mm_add_epi32(O1l, E2l);
3093             O1l = _mm_add_epi32(O1l, E3l);
3094             O1l = _mm_add_epi32(O1l, E4l);
3095             O1l = _mm_add_epi32(O1l, E5l);
3096             O1l = _mm_add_epi32(O1l, E6l);
3097             O1l = _mm_add_epi32(O1l, E7l);
3098 
3099             O1h = _mm_add_epi32(E0h, E1h);
3100             O1h = _mm_add_epi32(O1h, E2h);
3101             O1h = _mm_add_epi32(O1h, E3h);
3102             O1h = _mm_add_epi32(O1h, E4h);
3103             O1h = _mm_add_epi32(O1h, E5h);
3104             O1h = _mm_add_epi32(O1h, E6h);
3105             O1h = _mm_add_epi32(O1h, E7h);
3106             /* Compute O2*/
3107             E0l = _mm_madd_epi16(m128Tmp0,
3108                     _mm_load_si128((__m128i *) (transform32x32[0][2])));
3109             E0h = _mm_madd_epi16(m128Tmp1,
3110                     _mm_load_si128((__m128i *) (transform32x32[0][2])));
3111             E1l = _mm_madd_epi16(m128Tmp2,
3112                     _mm_load_si128((__m128i *) (transform32x32[1][2])));
3113             E1h = _mm_madd_epi16(m128Tmp3,
3114                     _mm_load_si128((__m128i *) (transform32x32[1][2])));
3115             E2l = _mm_madd_epi16(m128Tmp4,
3116                     _mm_load_si128((__m128i *) (transform32x32[2][2])));
3117             E2h = _mm_madd_epi16(m128Tmp5,
3118                     _mm_load_si128((__m128i *) (transform32x32[2][2])));
3119             E3l = _mm_madd_epi16(m128Tmp6,
3120                     _mm_load_si128((__m128i *) (transform32x32[3][2])));
3121             E3h = _mm_madd_epi16(m128Tmp7,
3122                     _mm_load_si128((__m128i *) (transform32x32[3][2])));
3123 
3124             E4l = _mm_madd_epi16(m128Tmp8,
3125                     _mm_load_si128((__m128i *) (transform32x32[4][2])));
3126             E4h = _mm_madd_epi16(m128Tmp9,
3127                     _mm_load_si128((__m128i *) (transform32x32[4][2])));
3128             E5l = _mm_madd_epi16(m128Tmp10,
3129                     _mm_load_si128((__m128i *) (transform32x32[5][2])));
3130             E5h = _mm_madd_epi16(m128Tmp11,
3131                     _mm_load_si128((__m128i *) (transform32x32[5][2])));
3132             E6l = _mm_madd_epi16(m128Tmp12,
3133                     _mm_load_si128((__m128i *) (transform32x32[6][2])));
3134             E6h = _mm_madd_epi16(m128Tmp13,
3135                     _mm_load_si128((__m128i *) (transform32x32[6][2])));
3136             E7l = _mm_madd_epi16(m128Tmp14,
3137                     _mm_load_si128((__m128i *) (transform32x32[7][2])));
3138             E7h = _mm_madd_epi16(m128Tmp15,
3139                     _mm_load_si128((__m128i *) (transform32x32[7][2])));
3140 
3141             O2l = _mm_add_epi32(E0l, E1l);
3142             O2l = _mm_add_epi32(O2l, E2l);
3143             O2l = _mm_add_epi32(O2l, E3l);
3144             O2l = _mm_add_epi32(O2l, E4l);
3145             O2l = _mm_add_epi32(O2l, E5l);
3146             O2l = _mm_add_epi32(O2l, E6l);
3147             O2l = _mm_add_epi32(O2l, E7l);
3148 
3149             O2h = _mm_add_epi32(E0h, E1h);
3150             O2h = _mm_add_epi32(O2h, E2h);
3151             O2h = _mm_add_epi32(O2h, E3h);
3152             O2h = _mm_add_epi32(O2h, E4h);
3153             O2h = _mm_add_epi32(O2h, E5h);
3154             O2h = _mm_add_epi32(O2h, E6h);
3155             O2h = _mm_add_epi32(O2h, E7h);
3156             /* Compute O3*/
3157             E0l = _mm_madd_epi16(m128Tmp0,
3158                     _mm_load_si128((__m128i *) (transform32x32[0][3])));
3159             E0h = _mm_madd_epi16(m128Tmp1,
3160                     _mm_load_si128((__m128i *) (transform32x32[0][3])));
3161             E1l = _mm_madd_epi16(m128Tmp2,
3162                     _mm_load_si128((__m128i *) (transform32x32[1][3])));
3163             E1h = _mm_madd_epi16(m128Tmp3,
3164                     _mm_load_si128((__m128i *) (transform32x32[1][3])));
3165             E2l = _mm_madd_epi16(m128Tmp4,
3166                     _mm_load_si128((__m128i *) (transform32x32[2][3])));
3167             E2h = _mm_madd_epi16(m128Tmp5,
3168                     _mm_load_si128((__m128i *) (transform32x32[2][3])));
3169             E3l = _mm_madd_epi16(m128Tmp6,
3170                     _mm_load_si128((__m128i *) (transform32x32[3][3])));
3171             E3h = _mm_madd_epi16(m128Tmp7,
3172                     _mm_load_si128((__m128i *) (transform32x32[3][3])));
3173 
3174             E4l = _mm_madd_epi16(m128Tmp8,
3175                     _mm_load_si128((__m128i *) (transform32x32[4][3])));
3176             E4h = _mm_madd_epi16(m128Tmp9,
3177                     _mm_load_si128((__m128i *) (transform32x32[4][3])));
3178             E5l = _mm_madd_epi16(m128Tmp10,
3179                     _mm_load_si128((__m128i *) (transform32x32[5][3])));
3180             E5h = _mm_madd_epi16(m128Tmp11,
3181                     _mm_load_si128((__m128i *) (transform32x32[5][3])));
3182             E6l = _mm_madd_epi16(m128Tmp12,
3183                     _mm_load_si128((__m128i *) (transform32x32[6][3])));
3184             E6h = _mm_madd_epi16(m128Tmp13,
3185                     _mm_load_si128((__m128i *) (transform32x32[6][3])));
3186             E7l = _mm_madd_epi16(m128Tmp14,
3187                     _mm_load_si128((__m128i *) (transform32x32[7][3])));
3188             E7h = _mm_madd_epi16(m128Tmp15,
3189                     _mm_load_si128((__m128i *) (transform32x32[7][3])));
3190 
3191             O3l = _mm_add_epi32(E0l, E1l);
3192             O3l = _mm_add_epi32(O3l, E2l);
3193             O3l = _mm_add_epi32(O3l, E3l);
3194             O3l = _mm_add_epi32(O3l, E4l);
3195             O3l = _mm_add_epi32(O3l, E5l);
3196             O3l = _mm_add_epi32(O3l, E6l);
3197             O3l = _mm_add_epi32(O3l, E7l);
3198 
3199             O3h = _mm_add_epi32(E0h, E1h);
3200             O3h = _mm_add_epi32(O3h, E2h);
3201             O3h = _mm_add_epi32(O3h, E3h);
3202             O3h = _mm_add_epi32(O3h, E4h);
3203             O3h = _mm_add_epi32(O3h, E5h);
3204             O3h = _mm_add_epi32(O3h, E6h);
3205             O3h = _mm_add_epi32(O3h, E7h);
3206             /* Compute O4*/
3207 
3208             E0l = _mm_madd_epi16(m128Tmp0,
3209                     _mm_load_si128((__m128i *) (transform32x32[0][4])));
3210             E0h = _mm_madd_epi16(m128Tmp1,
3211                     _mm_load_si128((__m128i *) (transform32x32[0][4])));
3212             E1l = _mm_madd_epi16(m128Tmp2,
3213                     _mm_load_si128((__m128i *) (transform32x32[1][4])));
3214             E1h = _mm_madd_epi16(m128Tmp3,
3215                     _mm_load_si128((__m128i *) (transform32x32[1][4])));
3216             E2l = _mm_madd_epi16(m128Tmp4,
3217                     _mm_load_si128((__m128i *) (transform32x32[2][4])));
3218             E2h = _mm_madd_epi16(m128Tmp5,
3219                     _mm_load_si128((__m128i *) (transform32x32[2][4])));
3220             E3l = _mm_madd_epi16(m128Tmp6,
3221                     _mm_load_si128((__m128i *) (transform32x32[3][4])));
3222             E3h = _mm_madd_epi16(m128Tmp7,
3223                     _mm_load_si128((__m128i *) (transform32x32[3][4])));
3224 
3225             E4l = _mm_madd_epi16(m128Tmp8,
3226                     _mm_load_si128((__m128i *) (transform32x32[4][4])));
3227             E4h = _mm_madd_epi16(m128Tmp9,
3228                     _mm_load_si128((__m128i *) (transform32x32[4][4])));
3229             E5l = _mm_madd_epi16(m128Tmp10,
3230                     _mm_load_si128((__m128i *) (transform32x32[5][4])));
3231             E5h = _mm_madd_epi16(m128Tmp11,
3232                     _mm_load_si128((__m128i *) (transform32x32[5][4])));
3233             E6l = _mm_madd_epi16(m128Tmp12,
3234                     _mm_load_si128((__m128i *) (transform32x32[6][4])));
3235             E6h = _mm_madd_epi16(m128Tmp13,
3236                     _mm_load_si128((__m128i *) (transform32x32[6][4])));
3237             E7l = _mm_madd_epi16(m128Tmp14,
3238                     _mm_load_si128((__m128i *) (transform32x32[7][4])));
3239             E7h = _mm_madd_epi16(m128Tmp15,
3240                     _mm_load_si128((__m128i *) (transform32x32[7][4])));
3241 
3242             O4l = _mm_add_epi32(E0l, E1l);
3243             O4l = _mm_add_epi32(O4l, E2l);
3244             O4l = _mm_add_epi32(O4l, E3l);
3245             O4l = _mm_add_epi32(O4l, E4l);
3246             O4l = _mm_add_epi32(O4l, E5l);
3247             O4l = _mm_add_epi32(O4l, E6l);
3248             O4l = _mm_add_epi32(O4l, E7l);
3249 
3250             O4h = _mm_add_epi32(E0h, E1h);
3251             O4h = _mm_add_epi32(O4h, E2h);
3252             O4h = _mm_add_epi32(O4h, E3h);
3253             O4h = _mm_add_epi32(O4h, E4h);
3254             O4h = _mm_add_epi32(O4h, E5h);
3255             O4h = _mm_add_epi32(O4h, E6h);
3256             O4h = _mm_add_epi32(O4h, E7h);
3257 
3258             /* Compute O5*/
3259             E0l = _mm_madd_epi16(m128Tmp0,
3260                     _mm_load_si128((__m128i *) (transform32x32[0][5])));
3261             E0h = _mm_madd_epi16(m128Tmp1,
3262                     _mm_load_si128((__m128i *) (transform32x32[0][5])));
3263             E1l = _mm_madd_epi16(m128Tmp2,
3264                     _mm_load_si128((__m128i *) (transform32x32[1][5])));
3265             E1h = _mm_madd_epi16(m128Tmp3,
3266                     _mm_load_si128((__m128i *) (transform32x32[1][5])));
3267             E2l = _mm_madd_epi16(m128Tmp4,
3268                     _mm_load_si128((__m128i *) (transform32x32[2][5])));
3269             E2h = _mm_madd_epi16(m128Tmp5,
3270                     _mm_load_si128((__m128i *) (transform32x32[2][5])));
3271             E3l = _mm_madd_epi16(m128Tmp6,
3272                     _mm_load_si128((__m128i *) (transform32x32[3][5])));
3273             E3h = _mm_madd_epi16(m128Tmp7,
3274                     _mm_load_si128((__m128i *) (transform32x32[3][5])));
3275 
3276             E4l = _mm_madd_epi16(m128Tmp8,
3277                     _mm_load_si128((__m128i *) (transform32x32[4][5])));
3278             E4h = _mm_madd_epi16(m128Tmp9,
3279                     _mm_load_si128((__m128i *) (transform32x32[4][5])));
3280             E5l = _mm_madd_epi16(m128Tmp10,
3281                     _mm_load_si128((__m128i *) (transform32x32[5][5])));
3282             E5h = _mm_madd_epi16(m128Tmp11,
3283                     _mm_load_si128((__m128i *) (transform32x32[5][5])));
3284             E6l = _mm_madd_epi16(m128Tmp12,
3285                     _mm_load_si128((__m128i *) (transform32x32[6][5])));
3286             E6h = _mm_madd_epi16(m128Tmp13,
3287                     _mm_load_si128((__m128i *) (transform32x32[6][5])));
3288             E7l = _mm_madd_epi16(m128Tmp14,
3289                     _mm_load_si128((__m128i *) (transform32x32[7][5])));
3290             E7h = _mm_madd_epi16(m128Tmp15,
3291                     _mm_load_si128((__m128i *) (transform32x32[7][5])));
3292 
3293             O5l = _mm_add_epi32(E0l, E1l);
3294             O5l = _mm_add_epi32(O5l, E2l);
3295             O5l = _mm_add_epi32(O5l, E3l);
3296             O5l = _mm_add_epi32(O5l, E4l);
3297             O5l = _mm_add_epi32(O5l, E5l);
3298             O5l = _mm_add_epi32(O5l, E6l);
3299             O5l = _mm_add_epi32(O5l, E7l);
3300 
3301             O5h = _mm_add_epi32(E0h, E1h);
3302             O5h = _mm_add_epi32(O5h, E2h);
3303             O5h = _mm_add_epi32(O5h, E3h);
3304             O5h = _mm_add_epi32(O5h, E4h);
3305             O5h = _mm_add_epi32(O5h, E5h);
3306             O5h = _mm_add_epi32(O5h, E6h);
3307             O5h = _mm_add_epi32(O5h, E7h);
3308 
3309             /* Compute O6*/
3310 
3311             E0l = _mm_madd_epi16(m128Tmp0,
3312                     _mm_load_si128((__m128i *) (transform32x32[0][6])));
3313             E0h = _mm_madd_epi16(m128Tmp1,
3314                     _mm_load_si128((__m128i *) (transform32x32[0][6])));
3315             E1l = _mm_madd_epi16(m128Tmp2,
3316                     _mm_load_si128((__m128i *) (transform32x32[1][6])));
3317             E1h = _mm_madd_epi16(m128Tmp3,
3318                     _mm_load_si128((__m128i *) (transform32x32[1][6])));
3319             E2l = _mm_madd_epi16(m128Tmp4,
3320                     _mm_load_si128((__m128i *) (transform32x32[2][6])));
3321             E2h = _mm_madd_epi16(m128Tmp5,
3322                     _mm_load_si128((__m128i *) (transform32x32[2][6])));
3323             E3l = _mm_madd_epi16(m128Tmp6,
3324                     _mm_load_si128((__m128i *) (transform32x32[3][6])));
3325             E3h = _mm_madd_epi16(m128Tmp7,
3326                     _mm_load_si128((__m128i *) (transform32x32[3][6])));
3327 
3328             E4l = _mm_madd_epi16(m128Tmp8,
3329                     _mm_load_si128((__m128i *) (transform32x32[4][6])));
3330             E4h = _mm_madd_epi16(m128Tmp9,
3331                     _mm_load_si128((__m128i *) (transform32x32[4][6])));
3332             E5l = _mm_madd_epi16(m128Tmp10,
3333                     _mm_load_si128((__m128i *) (transform32x32[5][6])));
3334             E5h = _mm_madd_epi16(m128Tmp11,
3335                     _mm_load_si128((__m128i *) (transform32x32[5][6])));
3336             E6l = _mm_madd_epi16(m128Tmp12,
3337                     _mm_load_si128((__m128i *) (transform32x32[6][6])));
3338             E6h = _mm_madd_epi16(m128Tmp13,
3339                     _mm_load_si128((__m128i *) (transform32x32[6][6])));
3340             E7l = _mm_madd_epi16(m128Tmp14,
3341                     _mm_load_si128((__m128i *) (transform32x32[7][6])));
3342             E7h = _mm_madd_epi16(m128Tmp15,
3343                     _mm_load_si128((__m128i *) (transform32x32[7][6])));
3344 
3345             O6l = _mm_add_epi32(E0l, E1l);
3346             O6l = _mm_add_epi32(O6l, E2l);
3347             O6l = _mm_add_epi32(O6l, E3l);
3348             O6l = _mm_add_epi32(O6l, E4l);
3349             O6l = _mm_add_epi32(O6l, E5l);
3350             O6l = _mm_add_epi32(O6l, E6l);
3351             O6l = _mm_add_epi32(O6l, E7l);
3352 
3353             O6h = _mm_add_epi32(E0h, E1h);
3354             O6h = _mm_add_epi32(O6h, E2h);
3355             O6h = _mm_add_epi32(O6h, E3h);
3356             O6h = _mm_add_epi32(O6h, E4h);
3357             O6h = _mm_add_epi32(O6h, E5h);
3358             O6h = _mm_add_epi32(O6h, E6h);
3359             O6h = _mm_add_epi32(O6h, E7h);
3360 
3361             /* Compute O7*/
3362 
3363             E0l = _mm_madd_epi16(m128Tmp0,
3364                     _mm_load_si128((__m128i *) (transform32x32[0][7])));
3365             E0h = _mm_madd_epi16(m128Tmp1,
3366                     _mm_load_si128((__m128i *) (transform32x32[0][7])));
3367             E1l = _mm_madd_epi16(m128Tmp2,
3368                     _mm_load_si128((__m128i *) (transform32x32[1][7])));
3369             E1h = _mm_madd_epi16(m128Tmp3,
3370                     _mm_load_si128((__m128i *) (transform32x32[1][7])));
3371             E2l = _mm_madd_epi16(m128Tmp4,
3372                     _mm_load_si128((__m128i *) (transform32x32[2][7])));
3373             E2h = _mm_madd_epi16(m128Tmp5,
3374                     _mm_load_si128((__m128i *) (transform32x32[2][7])));
3375             E3l = _mm_madd_epi16(m128Tmp6,
3376                     _mm_load_si128((__m128i *) (transform32x32[3][7])));
3377             E3h = _mm_madd_epi16(m128Tmp7,
3378                     _mm_load_si128((__m128i *) (transform32x32[3][7])));
3379 
3380             E4l = _mm_madd_epi16(m128Tmp8,
3381                     _mm_load_si128((__m128i *) (transform32x32[4][7])));
3382             E4h = _mm_madd_epi16(m128Tmp9,
3383                     _mm_load_si128((__m128i *) (transform32x32[4][7])));
3384             E5l = _mm_madd_epi16(m128Tmp10,
3385                     _mm_load_si128((__m128i *) (transform32x32[5][7])));
3386             E5h = _mm_madd_epi16(m128Tmp11,
3387                     _mm_load_si128((__m128i *) (transform32x32[5][7])));
3388             E6l = _mm_madd_epi16(m128Tmp12,
3389                     _mm_load_si128((__m128i *) (transform32x32[6][7])));
3390             E6h = _mm_madd_epi16(m128Tmp13,
3391                     _mm_load_si128((__m128i *) (transform32x32[6][7])));
3392             E7l = _mm_madd_epi16(m128Tmp14,
3393                     _mm_load_si128((__m128i *) (transform32x32[7][7])));
3394             E7h = _mm_madd_epi16(m128Tmp15,
3395                     _mm_load_si128((__m128i *) (transform32x32[7][7])));
3396 
3397             O7l = _mm_add_epi32(E0l, E1l);
3398             O7l = _mm_add_epi32(O7l, E2l);
3399             O7l = _mm_add_epi32(O7l, E3l);
3400             O7l = _mm_add_epi32(O7l, E4l);
3401             O7l = _mm_add_epi32(O7l, E5l);
3402             O7l = _mm_add_epi32(O7l, E6l);
3403             O7l = _mm_add_epi32(O7l, E7l);
3404 
3405             O7h = _mm_add_epi32(E0h, E1h);
3406             O7h = _mm_add_epi32(O7h, E2h);
3407             O7h = _mm_add_epi32(O7h, E3h);
3408             O7h = _mm_add_epi32(O7h, E4h);
3409             O7h = _mm_add_epi32(O7h, E5h);
3410             O7h = _mm_add_epi32(O7h, E6h);
3411             O7h = _mm_add_epi32(O7h, E7h);
3412 
3413             /* Compute O8*/
3414 
3415             E0l = _mm_madd_epi16(m128Tmp0,
3416                     _mm_load_si128((__m128i *) (transform32x32[0][8])));
3417             E0h = _mm_madd_epi16(m128Tmp1,
3418                     _mm_load_si128((__m128i *) (transform32x32[0][8])));
3419             E1l = _mm_madd_epi16(m128Tmp2,
3420                     _mm_load_si128((__m128i *) (transform32x32[1][8])));
3421             E1h = _mm_madd_epi16(m128Tmp3,
3422                     _mm_load_si128((__m128i *) (transform32x32[1][8])));
3423             E2l = _mm_madd_epi16(m128Tmp4,
3424                     _mm_load_si128((__m128i *) (transform32x32[2][8])));
3425             E2h = _mm_madd_epi16(m128Tmp5,
3426                     _mm_load_si128((__m128i *) (transform32x32[2][8])));
3427             E3l = _mm_madd_epi16(m128Tmp6,
3428                     _mm_load_si128((__m128i *) (transform32x32[3][8])));
3429             E3h = _mm_madd_epi16(m128Tmp7,
3430                     _mm_load_si128((__m128i *) (transform32x32[3][8])));
3431 
3432             E4l = _mm_madd_epi16(m128Tmp8,
3433                     _mm_load_si128((__m128i *) (transform32x32[4][8])));
3434             E4h = _mm_madd_epi16(m128Tmp9,
3435                     _mm_load_si128((__m128i *) (transform32x32[4][8])));
3436             E5l = _mm_madd_epi16(m128Tmp10,
3437                     _mm_load_si128((__m128i *) (transform32x32[5][8])));
3438             E5h = _mm_madd_epi16(m128Tmp11,
3439                     _mm_load_si128((__m128i *) (transform32x32[5][8])));
3440             E6l = _mm_madd_epi16(m128Tmp12,
3441                     _mm_load_si128((__m128i *) (transform32x32[6][8])));
3442             E6h = _mm_madd_epi16(m128Tmp13,
3443                     _mm_load_si128((__m128i *) (transform32x32[6][8])));
3444             E7l = _mm_madd_epi16(m128Tmp14,
3445                     _mm_load_si128((__m128i *) (transform32x32[7][8])));
3446             E7h = _mm_madd_epi16(m128Tmp15,
3447                     _mm_load_si128((__m128i *) (transform32x32[7][8])));
3448 
3449             O8l = _mm_add_epi32(E0l, E1l);
3450             O8l = _mm_add_epi32(O8l, E2l);
3451             O8l = _mm_add_epi32(O8l, E3l);
3452             O8l = _mm_add_epi32(O8l, E4l);
3453             O8l = _mm_add_epi32(O8l, E5l);
3454             O8l = _mm_add_epi32(O8l, E6l);
3455             O8l = _mm_add_epi32(O8l, E7l);
3456 
3457             O8h = _mm_add_epi32(E0h, E1h);
3458             O8h = _mm_add_epi32(O8h, E2h);
3459             O8h = _mm_add_epi32(O8h, E3h);
3460             O8h = _mm_add_epi32(O8h, E4h);
3461             O8h = _mm_add_epi32(O8h, E5h);
3462             O8h = _mm_add_epi32(O8h, E6h);
3463             O8h = _mm_add_epi32(O8h, E7h);
3464 
3465             /* Compute O9*/
3466 
3467             E0l = _mm_madd_epi16(m128Tmp0,
3468                     _mm_load_si128((__m128i *) (transform32x32[0][9])));
3469             E0h = _mm_madd_epi16(m128Tmp1,
3470                     _mm_load_si128((__m128i *) (transform32x32[0][9])));
3471             E1l = _mm_madd_epi16(m128Tmp2,
3472                     _mm_load_si128((__m128i *) (transform32x32[1][9])));
3473             E1h = _mm_madd_epi16(m128Tmp3,
3474                     _mm_load_si128((__m128i *) (transform32x32[1][9])));
3475             E2l = _mm_madd_epi16(m128Tmp4,
3476                     _mm_load_si128((__m128i *) (transform32x32[2][9])));
3477             E2h = _mm_madd_epi16(m128Tmp5,
3478                     _mm_load_si128((__m128i *) (transform32x32[2][9])));
3479             E3l = _mm_madd_epi16(m128Tmp6,
3480                     _mm_load_si128((__m128i *) (transform32x32[3][9])));
3481             E3h = _mm_madd_epi16(m128Tmp7,
3482                     _mm_load_si128((__m128i *) (transform32x32[3][9])));
3483 
3484             E4l = _mm_madd_epi16(m128Tmp8,
3485                     _mm_load_si128((__m128i *) (transform32x32[4][9])));
3486             E4h = _mm_madd_epi16(m128Tmp9,
3487                     _mm_load_si128((__m128i *) (transform32x32[4][9])));
3488             E5l = _mm_madd_epi16(m128Tmp10,
3489                     _mm_load_si128((__m128i *) (transform32x32[5][9])));
3490             E5h = _mm_madd_epi16(m128Tmp11,
3491                     _mm_load_si128((__m128i *) (transform32x32[5][9])));
3492             E6l = _mm_madd_epi16(m128Tmp12,
3493                     _mm_load_si128((__m128i *) (transform32x32[6][9])));
3494             E6h = _mm_madd_epi16(m128Tmp13,
3495                     _mm_load_si128((__m128i *) (transform32x32[6][9])));
3496             E7l = _mm_madd_epi16(m128Tmp14,
3497                     _mm_load_si128((__m128i *) (transform32x32[7][9])));
3498             E7h = _mm_madd_epi16(m128Tmp15,
3499                     _mm_load_si128((__m128i *) (transform32x32[7][9])));
3500 
3501             O9l = _mm_add_epi32(E0l, E1l);
3502             O9l = _mm_add_epi32(O9l, E2l);
3503             O9l = _mm_add_epi32(O9l, E3l);
3504             O9l = _mm_add_epi32(O9l, E4l);
3505             O9l = _mm_add_epi32(O9l, E5l);
3506             O9l = _mm_add_epi32(O9l, E6l);
3507             O9l = _mm_add_epi32(O9l, E7l);
3508 
3509             O9h = _mm_add_epi32(E0h, E1h);
3510             O9h = _mm_add_epi32(O9h, E2h);
3511             O9h = _mm_add_epi32(O9h, E3h);
3512             O9h = _mm_add_epi32(O9h, E4h);
3513             O9h = _mm_add_epi32(O9h, E5h);
3514             O9h = _mm_add_epi32(O9h, E6h);
3515             O9h = _mm_add_epi32(O9h, E7h);
3516 
            /* Compute O10 */
3518 
3519             E0l = _mm_madd_epi16(m128Tmp0,
3520                     _mm_load_si128((__m128i *) (transform32x32[0][10])));
3521             E0h = _mm_madd_epi16(m128Tmp1,
3522                     _mm_load_si128((__m128i *) (transform32x32[0][10])));
3523             E1l = _mm_madd_epi16(m128Tmp2,
3524                     _mm_load_si128((__m128i *) (transform32x32[1][10])));
3525             E1h = _mm_madd_epi16(m128Tmp3,
3526                     _mm_load_si128((__m128i *) (transform32x32[1][10])));
3527             E2l = _mm_madd_epi16(m128Tmp4,
3528                     _mm_load_si128((__m128i *) (transform32x32[2][10])));
3529             E2h = _mm_madd_epi16(m128Tmp5,
3530                     _mm_load_si128((__m128i *) (transform32x32[2][10])));
3531             E3l = _mm_madd_epi16(m128Tmp6,
3532                     _mm_load_si128((__m128i *) (transform32x32[3][10])));
3533             E3h = _mm_madd_epi16(m128Tmp7,
3534                     _mm_load_si128((__m128i *) (transform32x32[3][10])));
3535 
3536             E4l = _mm_madd_epi16(m128Tmp8,
3537                     _mm_load_si128((__m128i *) (transform32x32[4][10])));
3538             E4h = _mm_madd_epi16(m128Tmp9,
3539                     _mm_load_si128((__m128i *) (transform32x32[4][10])));
3540             E5l = _mm_madd_epi16(m128Tmp10,
3541                     _mm_load_si128((__m128i *) (transform32x32[5][10])));
3542             E5h = _mm_madd_epi16(m128Tmp11,
3543                     _mm_load_si128((__m128i *) (transform32x32[5][10])));
3544             E6l = _mm_madd_epi16(m128Tmp12,
3545                     _mm_load_si128((__m128i *) (transform32x32[6][10])));
3546             E6h = _mm_madd_epi16(m128Tmp13,
3547                     _mm_load_si128((__m128i *) (transform32x32[6][10])));
3548             E7l = _mm_madd_epi16(m128Tmp14,
3549                     _mm_load_si128((__m128i *) (transform32x32[7][10])));
3550             E7h = _mm_madd_epi16(m128Tmp15,
3551                     _mm_load_si128((__m128i *) (transform32x32[7][10])));
3552 
3553             O10l = _mm_add_epi32(E0l, E1l);
3554             O10l = _mm_add_epi32(O10l, E2l);
3555             O10l = _mm_add_epi32(O10l, E3l);
3556             O10l = _mm_add_epi32(O10l, E4l);
3557             O10l = _mm_add_epi32(O10l, E5l);
3558             O10l = _mm_add_epi32(O10l, E6l);
3559             O10l = _mm_add_epi32(O10l, E7l);
3560 
3561             O10h = _mm_add_epi32(E0h, E1h);
3562             O10h = _mm_add_epi32(O10h, E2h);
3563             O10h = _mm_add_epi32(O10h, E3h);
3564             O10h = _mm_add_epi32(O10h, E4h);
3565             O10h = _mm_add_epi32(O10h, E5h);
3566             O10h = _mm_add_epi32(O10h, E6h);
3567             O10h = _mm_add_epi32(O10h, E7h);
3568 
            /* Compute O11 */
3570 
3571             E0l = _mm_madd_epi16(m128Tmp0,
3572                     _mm_load_si128((__m128i *) (transform32x32[0][11])));
3573             E0h = _mm_madd_epi16(m128Tmp1,
3574                     _mm_load_si128((__m128i *) (transform32x32[0][11])));
3575             E1l = _mm_madd_epi16(m128Tmp2,
3576                     _mm_load_si128((__m128i *) (transform32x32[1][11])));
3577             E1h = _mm_madd_epi16(m128Tmp3,
3578                     _mm_load_si128((__m128i *) (transform32x32[1][11])));
3579             E2l = _mm_madd_epi16(m128Tmp4,
3580                     _mm_load_si128((__m128i *) (transform32x32[2][11])));
3581             E2h = _mm_madd_epi16(m128Tmp5,
3582                     _mm_load_si128((__m128i *) (transform32x32[2][11])));
3583             E3l = _mm_madd_epi16(m128Tmp6,
3584                     _mm_load_si128((__m128i *) (transform32x32[3][11])));
3585             E3h = _mm_madd_epi16(m128Tmp7,
3586                     _mm_load_si128((__m128i *) (transform32x32[3][11])));
3587 
3588             E4l = _mm_madd_epi16(m128Tmp8,
3589                     _mm_load_si128((__m128i *) (transform32x32[4][11])));
3590             E4h = _mm_madd_epi16(m128Tmp9,
3591                     _mm_load_si128((__m128i *) (transform32x32[4][11])));
3592             E5l = _mm_madd_epi16(m128Tmp10,
3593                     _mm_load_si128((__m128i *) (transform32x32[5][11])));
3594             E5h = _mm_madd_epi16(m128Tmp11,
3595                     _mm_load_si128((__m128i *) (transform32x32[5][11])));
3596             E6l = _mm_madd_epi16(m128Tmp12,
3597                     _mm_load_si128((__m128i *) (transform32x32[6][11])));
3598             E6h = _mm_madd_epi16(m128Tmp13,
3599                     _mm_load_si128((__m128i *) (transform32x32[6][11])));
3600             E7l = _mm_madd_epi16(m128Tmp14,
3601                     _mm_load_si128((__m128i *) (transform32x32[7][11])));
3602             E7h = _mm_madd_epi16(m128Tmp15,
3603                     _mm_load_si128((__m128i *) (transform32x32[7][11])));
3604 
3605             O11l = _mm_add_epi32(E0l, E1l);
3606             O11l = _mm_add_epi32(O11l, E2l);
3607             O11l = _mm_add_epi32(O11l, E3l);
3608             O11l = _mm_add_epi32(O11l, E4l);
3609             O11l = _mm_add_epi32(O11l, E5l);
3610             O11l = _mm_add_epi32(O11l, E6l);
3611             O11l = _mm_add_epi32(O11l, E7l);
3612 
3613             O11h = _mm_add_epi32(E0h, E1h);
3614             O11h = _mm_add_epi32(O11h, E2h);
3615             O11h = _mm_add_epi32(O11h, E3h);
3616             O11h = _mm_add_epi32(O11h, E4h);
3617             O11h = _mm_add_epi32(O11h, E5h);
3618             O11h = _mm_add_epi32(O11h, E6h);
3619             O11h = _mm_add_epi32(O11h, E7h);
3620 
            /* Compute O12 */
3622 
3623             E0l = _mm_madd_epi16(m128Tmp0,
3624                     _mm_load_si128((__m128i *) (transform32x32[0][12])));
3625             E0h = _mm_madd_epi16(m128Tmp1,
3626                     _mm_load_si128((__m128i *) (transform32x32[0][12])));
3627             E1l = _mm_madd_epi16(m128Tmp2,
3628                     _mm_load_si128((__m128i *) (transform32x32[1][12])));
3629             E1h = _mm_madd_epi16(m128Tmp3,
3630                     _mm_load_si128((__m128i *) (transform32x32[1][12])));
3631             E2l = _mm_madd_epi16(m128Tmp4,
3632                     _mm_load_si128((__m128i *) (transform32x32[2][12])));
3633             E2h = _mm_madd_epi16(m128Tmp5,
3634                     _mm_load_si128((__m128i *) (transform32x32[2][12])));
3635             E3l = _mm_madd_epi16(m128Tmp6,
3636                     _mm_load_si128((__m128i *) (transform32x32[3][12])));
3637             E3h = _mm_madd_epi16(m128Tmp7,
3638                     _mm_load_si128((__m128i *) (transform32x32[3][12])));
3639 
3640             E4l = _mm_madd_epi16(m128Tmp8,
3641                     _mm_load_si128((__m128i *) (transform32x32[4][12])));
3642             E4h = _mm_madd_epi16(m128Tmp9,
3643                     _mm_load_si128((__m128i *) (transform32x32[4][12])));
3644             E5l = _mm_madd_epi16(m128Tmp10,
3645                     _mm_load_si128((__m128i *) (transform32x32[5][12])));
3646             E5h = _mm_madd_epi16(m128Tmp11,
3647                     _mm_load_si128((__m128i *) (transform32x32[5][12])));
3648             E6l = _mm_madd_epi16(m128Tmp12,
3649                     _mm_load_si128((__m128i *) (transform32x32[6][12])));
3650             E6h = _mm_madd_epi16(m128Tmp13,
3651                     _mm_load_si128((__m128i *) (transform32x32[6][12])));
3652             E7l = _mm_madd_epi16(m128Tmp14,
3653                     _mm_load_si128((__m128i *) (transform32x32[7][12])));
3654             E7h = _mm_madd_epi16(m128Tmp15,
3655                     _mm_load_si128((__m128i *) (transform32x32[7][12])));
3656 
3657             O12l = _mm_add_epi32(E0l, E1l);
3658             O12l = _mm_add_epi32(O12l, E2l);
3659             O12l = _mm_add_epi32(O12l, E3l);
3660             O12l = _mm_add_epi32(O12l, E4l);
3661             O12l = _mm_add_epi32(O12l, E5l);
3662             O12l = _mm_add_epi32(O12l, E6l);
3663             O12l = _mm_add_epi32(O12l, E7l);
3664 
3665             O12h = _mm_add_epi32(E0h, E1h);
3666             O12h = _mm_add_epi32(O12h, E2h);
3667             O12h = _mm_add_epi32(O12h, E3h);
3668             O12h = _mm_add_epi32(O12h, E4h);
3669             O12h = _mm_add_epi32(O12h, E5h);
3670             O12h = _mm_add_epi32(O12h, E6h);
3671             O12h = _mm_add_epi32(O12h, E7h);
3672 
            /* Compute O13 */
3674 
3675             E0l = _mm_madd_epi16(m128Tmp0,
3676                     _mm_load_si128((__m128i *) (transform32x32[0][13])));
3677             E0h = _mm_madd_epi16(m128Tmp1,
3678                     _mm_load_si128((__m128i *) (transform32x32[0][13])));
3679             E1l = _mm_madd_epi16(m128Tmp2,
3680                     _mm_load_si128((__m128i *) (transform32x32[1][13])));
3681             E1h = _mm_madd_epi16(m128Tmp3,
3682                     _mm_load_si128((__m128i *) (transform32x32[1][13])));
3683             E2l = _mm_madd_epi16(m128Tmp4,
3684                     _mm_load_si128((__m128i *) (transform32x32[2][13])));
3685             E2h = _mm_madd_epi16(m128Tmp5,
3686                     _mm_load_si128((__m128i *) (transform32x32[2][13])));
3687             E3l = _mm_madd_epi16(m128Tmp6,
3688                     _mm_load_si128((__m128i *) (transform32x32[3][13])));
3689             E3h = _mm_madd_epi16(m128Tmp7,
3690                     _mm_load_si128((__m128i *) (transform32x32[3][13])));
3691 
3692             E4l = _mm_madd_epi16(m128Tmp8,
3693                     _mm_load_si128((__m128i *) (transform32x32[4][13])));
3694             E4h = _mm_madd_epi16(m128Tmp9,
3695                     _mm_load_si128((__m128i *) (transform32x32[4][13])));
3696             E5l = _mm_madd_epi16(m128Tmp10,
3697                     _mm_load_si128((__m128i *) (transform32x32[5][13])));
3698             E5h = _mm_madd_epi16(m128Tmp11,
3699                     _mm_load_si128((__m128i *) (transform32x32[5][13])));
3700             E6l = _mm_madd_epi16(m128Tmp12,
3701                     _mm_load_si128((__m128i *) (transform32x32[6][13])));
3702             E6h = _mm_madd_epi16(m128Tmp13,
3703                     _mm_load_si128((__m128i *) (transform32x32[6][13])));
3704             E7l = _mm_madd_epi16(m128Tmp14,
3705                     _mm_load_si128((__m128i *) (transform32x32[7][13])));
3706             E7h = _mm_madd_epi16(m128Tmp15,
3707                     _mm_load_si128((__m128i *) (transform32x32[7][13])));
3708 
3709             O13l = _mm_add_epi32(E0l, E1l);
3710             O13l = _mm_add_epi32(O13l, E2l);
3711             O13l = _mm_add_epi32(O13l, E3l);
3712             O13l = _mm_add_epi32(O13l, E4l);
3713             O13l = _mm_add_epi32(O13l, E5l);
3714             O13l = _mm_add_epi32(O13l, E6l);
3715             O13l = _mm_add_epi32(O13l, E7l);
3716 
3717             O13h = _mm_add_epi32(E0h, E1h);
3718             O13h = _mm_add_epi32(O13h, E2h);
3719             O13h = _mm_add_epi32(O13h, E3h);
3720             O13h = _mm_add_epi32(O13h, E4h);
3721             O13h = _mm_add_epi32(O13h, E5h);
3722             O13h = _mm_add_epi32(O13h, E6h);
3723             O13h = _mm_add_epi32(O13h, E7h);
3724 
3725             /* Compute O14  */
3726 
3727             E0l = _mm_madd_epi16(m128Tmp0,
3728                     _mm_load_si128((__m128i *) (transform32x32[0][14])));
3729             E0h = _mm_madd_epi16(m128Tmp1,
3730                     _mm_load_si128((__m128i *) (transform32x32[0][14])));
3731             E1l = _mm_madd_epi16(m128Tmp2,
3732                     _mm_load_si128((__m128i *) (transform32x32[1][14])));
3733             E1h = _mm_madd_epi16(m128Tmp3,
3734                     _mm_load_si128((__m128i *) (transform32x32[1][14])));
3735             E2l = _mm_madd_epi16(m128Tmp4,
3736                     _mm_load_si128((__m128i *) (transform32x32[2][14])));
3737             E2h = _mm_madd_epi16(m128Tmp5,
3738                     _mm_load_si128((__m128i *) (transform32x32[2][14])));
3739             E3l = _mm_madd_epi16(m128Tmp6,
3740                     _mm_load_si128((__m128i *) (transform32x32[3][14])));
3741             E3h = _mm_madd_epi16(m128Tmp7,
3742                     _mm_load_si128((__m128i *) (transform32x32[3][14])));
3743 
3744             E4l = _mm_madd_epi16(m128Tmp8,
3745                     _mm_load_si128((__m128i *) (transform32x32[4][14])));
3746             E4h = _mm_madd_epi16(m128Tmp9,
3747                     _mm_load_si128((__m128i *) (transform32x32[4][14])));
3748             E5l = _mm_madd_epi16(m128Tmp10,
3749                     _mm_load_si128((__m128i *) (transform32x32[5][14])));
3750             E5h = _mm_madd_epi16(m128Tmp11,
3751                     _mm_load_si128((__m128i *) (transform32x32[5][14])));
3752             E6l = _mm_madd_epi16(m128Tmp12,
3753                     _mm_load_si128((__m128i *) (transform32x32[6][14])));
3754             E6h = _mm_madd_epi16(m128Tmp13,
3755                     _mm_load_si128((__m128i *) (transform32x32[6][14])));
3756             E7l = _mm_madd_epi16(m128Tmp14,
3757                     _mm_load_si128((__m128i *) (transform32x32[7][14])));
3758             E7h = _mm_madd_epi16(m128Tmp15,
3759                     _mm_load_si128((__m128i *) (transform32x32[7][14])));
3760 
3761             O14l = _mm_add_epi32(E0l, E1l);
3762             O14l = _mm_add_epi32(O14l, E2l);
3763             O14l = _mm_add_epi32(O14l, E3l);
3764             O14l = _mm_add_epi32(O14l, E4l);
3765             O14l = _mm_add_epi32(O14l, E5l);
3766             O14l = _mm_add_epi32(O14l, E6l);
3767             O14l = _mm_add_epi32(O14l, E7l);
3768 
3769             O14h = _mm_add_epi32(E0h, E1h);
3770             O14h = _mm_add_epi32(O14h, E2h);
3771             O14h = _mm_add_epi32(O14h, E3h);
3772             O14h = _mm_add_epi32(O14h, E4h);
3773             O14h = _mm_add_epi32(O14h, E5h);
3774             O14h = _mm_add_epi32(O14h, E6h);
3775             O14h = _mm_add_epi32(O14h, E7h);
3776 
3777             /* Compute O15*/
3778 
3779             E0l = _mm_madd_epi16(m128Tmp0,
3780                     _mm_load_si128((__m128i *) (transform32x32[0][15])));
3781             E0h = _mm_madd_epi16(m128Tmp1,
3782                     _mm_load_si128((__m128i *) (transform32x32[0][15])));
3783             E1l = _mm_madd_epi16(m128Tmp2,
3784                     _mm_load_si128((__m128i *) (transform32x32[1][15])));
3785             E1h = _mm_madd_epi16(m128Tmp3,
3786                     _mm_load_si128((__m128i *) (transform32x32[1][15])));
3787             E2l = _mm_madd_epi16(m128Tmp4,
3788                     _mm_load_si128((__m128i *) (transform32x32[2][15])));
3789             E2h = _mm_madd_epi16(m128Tmp5,
3790                     _mm_load_si128((__m128i *) (transform32x32[2][15])));
3791             E3l = _mm_madd_epi16(m128Tmp6,
3792                     _mm_load_si128((__m128i *) (transform32x32[3][15])));
3793             E3h = _mm_madd_epi16(m128Tmp7,
3794                     _mm_load_si128((__m128i *) (transform32x32[3][15])));
3795 
3796             E4l = _mm_madd_epi16(m128Tmp8,
3797                     _mm_load_si128((__m128i *) (transform32x32[4][15])));
3798             E4h = _mm_madd_epi16(m128Tmp9,
3799                     _mm_load_si128((__m128i *) (transform32x32[4][15])));
3800             E5l = _mm_madd_epi16(m128Tmp10,
3801                     _mm_load_si128((__m128i *) (transform32x32[5][15])));
3802             E5h = _mm_madd_epi16(m128Tmp11,
3803                     _mm_load_si128((__m128i *) (transform32x32[5][15])));
3804             E6l = _mm_madd_epi16(m128Tmp12,
3805                     _mm_load_si128((__m128i *) (transform32x32[6][15])));
3806             E6h = _mm_madd_epi16(m128Tmp13,
3807                     _mm_load_si128((__m128i *) (transform32x32[6][15])));
3808             E7l = _mm_madd_epi16(m128Tmp14,
3809                     _mm_load_si128((__m128i *) (transform32x32[7][15])));
3810             E7h = _mm_madd_epi16(m128Tmp15,
3811                     _mm_load_si128((__m128i *) (transform32x32[7][15])));
3812 
3813             O15l = _mm_add_epi32(E0l, E1l);
3814             O15l = _mm_add_epi32(O15l, E2l);
3815             O15l = _mm_add_epi32(O15l, E3l);
3816             O15l = _mm_add_epi32(O15l, E4l);
3817             O15l = _mm_add_epi32(O15l, E5l);
3818             O15l = _mm_add_epi32(O15l, E6l);
3819             O15l = _mm_add_epi32(O15l, E7l);
3820 
3821             O15h = _mm_add_epi32(E0h, E1h);
3822             O15h = _mm_add_epi32(O15h, E2h);
3823             O15h = _mm_add_epi32(O15h, E3h);
3824             O15h = _mm_add_epi32(O15h, E4h);
3825             O15h = _mm_add_epi32(O15h, E5h);
3826             O15h = _mm_add_epi32(O15h, E6h);
3827             O15h = _mm_add_epi32(O15h, E7h);
3828             /*  Compute E0  */
3829 
3830             m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
3831             E0l = _mm_madd_epi16(m128Tmp0,
3832                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3833             m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
3834             E0h = _mm_madd_epi16(m128Tmp1,
3835                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
3836 
3837             m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
3838             E0l = _mm_add_epi32(E0l,
3839                     _mm_madd_epi16(m128Tmp2,
3840                             _mm_load_si128(
3841                                     (__m128i *) (transform16x16_1[1][0]))));
3842             m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
3843             E0h = _mm_add_epi32(E0h,
3844                     _mm_madd_epi16(m128Tmp3,
3845                             _mm_load_si128(
3846                                     (__m128i *) (transform16x16_1[1][0]))));
3847 
3848             m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
3849             E0l = _mm_add_epi32(E0l,
3850                     _mm_madd_epi16(m128Tmp4,
3851                             _mm_load_si128(
3852                                     (__m128i *) (transform16x16_1[2][0]))));
3853             m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
3854             E0h = _mm_add_epi32(E0h,
3855                     _mm_madd_epi16(m128Tmp5,
3856                             _mm_load_si128(
3857                                     (__m128i *) (transform16x16_1[2][0]))));
3858 
3859             m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
3860             E0l = _mm_add_epi32(E0l,
3861                     _mm_madd_epi16(m128Tmp6,
3862                             _mm_load_si128(
3863                                     (__m128i *) (transform16x16_1[3][0]))));
3864             m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
3865             E0h = _mm_add_epi32(E0h,
3866                     _mm_madd_epi16(m128Tmp7,
3867                             _mm_load_si128(
3868                                     (__m128i *) (transform16x16_1[3][0]))));
3869 
3870             /*  Compute E1  */
3871             E1l = _mm_madd_epi16(m128Tmp0,
3872                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3873             E1h = _mm_madd_epi16(m128Tmp1,
3874                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
3875             E1l = _mm_add_epi32(E1l,
3876                     _mm_madd_epi16(m128Tmp2,
3877                             _mm_load_si128(
3878                                     (__m128i *) (transform16x16_1[1][1]))));
3879             E1h = _mm_add_epi32(E1h,
3880                     _mm_madd_epi16(m128Tmp3,
3881                             _mm_load_si128(
3882                                     (__m128i *) (transform16x16_1[1][1]))));
3883             E1l = _mm_add_epi32(E1l,
3884                     _mm_madd_epi16(m128Tmp4,
3885                             _mm_load_si128(
3886                                     (__m128i *) (transform16x16_1[2][1]))));
3887             E1h = _mm_add_epi32(E1h,
3888                     _mm_madd_epi16(m128Tmp5,
3889                             _mm_load_si128(
3890                                     (__m128i *) (transform16x16_1[2][1]))));
3891             E1l = _mm_add_epi32(E1l,
3892                     _mm_madd_epi16(m128Tmp6,
3893                             _mm_load_si128(
3894                                     (__m128i *) (transform16x16_1[3][1]))));
3895             E1h = _mm_add_epi32(E1h,
3896                     _mm_madd_epi16(m128Tmp7,
3897                             _mm_load_si128(
3898                                     (__m128i *) (transform16x16_1[3][1]))));
3899 
3900             /*  Compute E2  */
3901             E2l = _mm_madd_epi16(m128Tmp0,
3902                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3903             E2h = _mm_madd_epi16(m128Tmp1,
3904                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
3905             E2l = _mm_add_epi32(E2l,
3906                     _mm_madd_epi16(m128Tmp2,
3907                             _mm_load_si128(
3908                                     (__m128i *) (transform16x16_1[1][2]))));
3909             E2h = _mm_add_epi32(E2h,
3910                     _mm_madd_epi16(m128Tmp3,
3911                             _mm_load_si128(
3912                                     (__m128i *) (transform16x16_1[1][2]))));
3913             E2l = _mm_add_epi32(E2l,
3914                     _mm_madd_epi16(m128Tmp4,
3915                             _mm_load_si128(
3916                                     (__m128i *) (transform16x16_1[2][2]))));
3917             E2h = _mm_add_epi32(E2h,
3918                     _mm_madd_epi16(m128Tmp5,
3919                             _mm_load_si128(
3920                                     (__m128i *) (transform16x16_1[2][2]))));
3921             E2l = _mm_add_epi32(E2l,
3922                     _mm_madd_epi16(m128Tmp6,
3923                             _mm_load_si128(
3924                                     (__m128i *) (transform16x16_1[3][2]))));
3925             E2h = _mm_add_epi32(E2h,
3926                     _mm_madd_epi16(m128Tmp7,
3927                             _mm_load_si128(
3928                                     (__m128i *) (transform16x16_1[3][2]))));
3929 
3930             /*  Compute E3  */
3931             E3l = _mm_madd_epi16(m128Tmp0,
3932                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3933             E3h = _mm_madd_epi16(m128Tmp1,
3934                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
3935             E3l = _mm_add_epi32(E3l,
3936                     _mm_madd_epi16(m128Tmp2,
3937                             _mm_load_si128(
3938                                     (__m128i *) (transform16x16_1[1][3]))));
3939             E3h = _mm_add_epi32(E3h,
3940                     _mm_madd_epi16(m128Tmp3,
3941                             _mm_load_si128(
3942                                     (__m128i *) (transform16x16_1[1][3]))));
3943             E3l = _mm_add_epi32(E3l,
3944                     _mm_madd_epi16(m128Tmp4,
3945                             _mm_load_si128(
3946                                     (__m128i *) (transform16x16_1[2][3]))));
3947             E3h = _mm_add_epi32(E3h,
3948                     _mm_madd_epi16(m128Tmp5,
3949                             _mm_load_si128(
3950                                     (__m128i *) (transform16x16_1[2][3]))));
3951             E3l = _mm_add_epi32(E3l,
3952                     _mm_madd_epi16(m128Tmp6,
3953                             _mm_load_si128(
3954                                     (__m128i *) (transform16x16_1[3][3]))));
3955             E3h = _mm_add_epi32(E3h,
3956                     _mm_madd_epi16(m128Tmp7,
3957                             _mm_load_si128(
3958                                     (__m128i *) (transform16x16_1[3][3]))));
3959 
3960             /*  Compute E4  */
3961             E4l = _mm_madd_epi16(m128Tmp0,
3962                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3963             E4h = _mm_madd_epi16(m128Tmp1,
3964                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
3965             E4l = _mm_add_epi32(E4l,
3966                     _mm_madd_epi16(m128Tmp2,
3967                             _mm_load_si128(
3968                                     (__m128i *) (transform16x16_1[1][4]))));
3969             E4h = _mm_add_epi32(E4h,
3970                     _mm_madd_epi16(m128Tmp3,
3971                             _mm_load_si128(
3972                                     (__m128i *) (transform16x16_1[1][4]))));
3973             E4l = _mm_add_epi32(E4l,
3974                     _mm_madd_epi16(m128Tmp4,
3975                             _mm_load_si128(
3976                                     (__m128i *) (transform16x16_1[2][4]))));
3977             E4h = _mm_add_epi32(E4h,
3978                     _mm_madd_epi16(m128Tmp5,
3979                             _mm_load_si128(
3980                                     (__m128i *) (transform16x16_1[2][4]))));
3981             E4l = _mm_add_epi32(E4l,
3982                     _mm_madd_epi16(m128Tmp6,
3983                             _mm_load_si128(
3984                                     (__m128i *) (transform16x16_1[3][4]))));
3985             E4h = _mm_add_epi32(E4h,
3986                     _mm_madd_epi16(m128Tmp7,
3987                             _mm_load_si128(
3988                                     (__m128i *) (transform16x16_1[3][4]))));
3989 
            /*  Compute E5  */
3991             E5l = _mm_madd_epi16(m128Tmp0,
3992                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3993             E5h = _mm_madd_epi16(m128Tmp1,
3994                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
3995             E5l = _mm_add_epi32(E5l,
3996                     _mm_madd_epi16(m128Tmp2,
3997                             _mm_load_si128(
3998                                     (__m128i *) (transform16x16_1[1][5]))));
3999             E5h = _mm_add_epi32(E5h,
4000                     _mm_madd_epi16(m128Tmp3,
4001                             _mm_load_si128(
4002                                     (__m128i *) (transform16x16_1[1][5]))));
4003             E5l = _mm_add_epi32(E5l,
4004                     _mm_madd_epi16(m128Tmp4,
4005                             _mm_load_si128(
4006                                     (__m128i *) (transform16x16_1[2][5]))));
4007             E5h = _mm_add_epi32(E5h,
4008                     _mm_madd_epi16(m128Tmp5,
4009                             _mm_load_si128(
4010                                     (__m128i *) (transform16x16_1[2][5]))));
4011             E5l = _mm_add_epi32(E5l,
4012                     _mm_madd_epi16(m128Tmp6,
4013                             _mm_load_si128(
4014                                     (__m128i *) (transform16x16_1[3][5]))));
4015             E5h = _mm_add_epi32(E5h,
4016                     _mm_madd_epi16(m128Tmp7,
4017                             _mm_load_si128(
4018                                     (__m128i *) (transform16x16_1[3][5]))));
4019 
4020             /*  Compute E6  */
4021             E6l = _mm_madd_epi16(m128Tmp0,
4022                     _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4023             E6h = _mm_madd_epi16(m128Tmp1,
4024                     _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
4025             E6l = _mm_add_epi32(E6l,
4026                     _mm_madd_epi16(m128Tmp2,
4027                             _mm_load_si128(
4028                                     (__m128i *) (transform16x16_1[1][6]))));
4029             E6h = _mm_add_epi32(E6h,
4030                     _mm_madd_epi16(m128Tmp3,
4031                             _mm_load_si128(
4032                                     (__m128i *) (transform16x16_1[1][6]))));
4033             E6l = _mm_add_epi32(E6l,
4034                     _mm_madd_epi16(m128Tmp4,
4035                             _mm_load_si128(
4036                                     (__m128i *) (transform16x16_1[2][6]))));
4037             E6h = _mm_add_epi32(E6h,
4038                     _mm_madd_epi16(m128Tmp5,
4039                             _mm_load_si128(
4040                                     (__m128i *) (transform16x16_1[2][6]))));
4041             E6l = _mm_add_epi32(E6l,
4042                     _mm_madd_epi16(m128Tmp6,
4043                             _mm_load_si128(
4044                                     (__m128i *) (transform16x16_1[3][6]))));
4045             E6h = _mm_add_epi32(E6h,
4046                     _mm_madd_epi16(m128Tmp7,
4047                             _mm_load_si128(
4048                                     (__m128i *) (transform16x16_1[3][6]))));
4049 
4050             /*  Compute E7  */
4051             E7l = _mm_madd_epi16(m128Tmp0,
4052                     _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4053             E7h = _mm_madd_epi16(m128Tmp1,
4054                     _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
4055             E7l = _mm_add_epi32(E7l,
4056                     _mm_madd_epi16(m128Tmp2,
4057                             _mm_load_si128(
4058                                     (__m128i *) (transform16x16_1[1][7]))));
4059             E7h = _mm_add_epi32(E7h,
4060                     _mm_madd_epi16(m128Tmp3,
4061                             _mm_load_si128(
4062                                     (__m128i *) (transform16x16_1[1][7]))));
4063             E7l = _mm_add_epi32(E7l,
4064                     _mm_madd_epi16(m128Tmp4,
4065                             _mm_load_si128(
4066                                     (__m128i *) (transform16x16_1[2][7]))));
4067             E7h = _mm_add_epi32(E7h,
4068                     _mm_madd_epi16(m128Tmp5,
4069                             _mm_load_si128(
4070                                     (__m128i *) (transform16x16_1[2][7]))));
4071             E7l = _mm_add_epi32(E7l,
4072                     _mm_madd_epi16(m128Tmp6,
4073                             _mm_load_si128(
4074                                     (__m128i *) (transform16x16_1[3][7]))));
4075             E7h = _mm_add_epi32(E7h,
4076                     _mm_madd_epi16(m128Tmp7,
4077                             _mm_load_si128(
4078                                     (__m128i *) (transform16x16_1[3][7]))));
4079 
            /*  Compute E00 to E03 (from rows 4, 12, 20, 28)  */

4082             m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
4083             E00l = _mm_madd_epi16(m128Tmp0,
4084                     _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4085             m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
4086             E00h = _mm_madd_epi16(m128Tmp1,
4087                     _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
4088 
4089             m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
4090             E00l = _mm_add_epi32(E00l,
4091                     _mm_madd_epi16(m128Tmp2,
4092                             _mm_load_si128(
4093                                     (__m128i *) (transform16x16_2[1][0]))));
4094             m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
4095             E00h = _mm_add_epi32(E00h,
4096                     _mm_madd_epi16(m128Tmp3,
4097                             _mm_load_si128(
4098                                     (__m128i *) (transform16x16_2[1][0]))));
4099 
4100             E01l = _mm_madd_epi16(m128Tmp0,
4101                     _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4102             E01h = _mm_madd_epi16(m128Tmp1,
4103                     _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
4104             E01l = _mm_add_epi32(E01l,
4105                     _mm_madd_epi16(m128Tmp2,
4106                             _mm_load_si128(
4107                                     (__m128i *) (transform16x16_2[1][1]))));
4108             E01h = _mm_add_epi32(E01h,
4109                     _mm_madd_epi16(m128Tmp3,
4110                             _mm_load_si128(
4111                                     (__m128i *) (transform16x16_2[1][1]))));
4112 
4113             E02l = _mm_madd_epi16(m128Tmp0,
4114                     _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4115             E02h = _mm_madd_epi16(m128Tmp1,
4116                     _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
4117             E02l = _mm_add_epi32(E02l,
4118                     _mm_madd_epi16(m128Tmp2,
4119                             _mm_load_si128(
4120                                     (__m128i *) (transform16x16_2[1][2]))));
4121             E02h = _mm_add_epi32(E02h,
4122                     _mm_madd_epi16(m128Tmp3,
4123                             _mm_load_si128(
4124                                     (__m128i *) (transform16x16_2[1][2]))));
4125 
4126             E03l = _mm_madd_epi16(m128Tmp0,
4127                     _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4128             E03h = _mm_madd_epi16(m128Tmp1,
4129                     _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
4130             E03l = _mm_add_epi32(E03l,
4131                     _mm_madd_epi16(m128Tmp2,
4132                             _mm_load_si128(
4133                                     (__m128i *) (transform16x16_2[1][3]))));
4134             E03h = _mm_add_epi32(E03h,
4135                     _mm_madd_epi16(m128Tmp3,
4136                             _mm_load_si128(
4137                                     (__m128i *) (transform16x16_2[1][3]))));
4138 
4139             /*  Compute EE0 and EEE */
4140 
4141             m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
4142             EE0l = _mm_madd_epi16(m128Tmp0,
4143                     _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4144             m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
4145             EE0h = _mm_madd_epi16(m128Tmp1,
4146                     _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
4147 
4148             m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
4149             EEE0l = _mm_madd_epi16(m128Tmp2,
4150                     _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4151             m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
4152             EEE0h = _mm_madd_epi16(m128Tmp3,
4153                     _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
4154 
4155             EE1l = _mm_madd_epi16(m128Tmp0,
4156                     _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4157             EE1h = _mm_madd_epi16(m128Tmp1,
4158                     _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
4159 
4160             EEE1l = _mm_madd_epi16(m128Tmp2,
4161                     _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4162             EEE1h = _mm_madd_epi16(m128Tmp3,
4163                     _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
4164 
4165             /*  Compute EE    */
4166 
4167             EE2l = _mm_sub_epi32(EEE1l, EE1l);
4168             EE3l = _mm_sub_epi32(EEE0l, EE0l);
4169             EE2h = _mm_sub_epi32(EEE1h, EE1h);
4170             EE3h = _mm_sub_epi32(EEE0h, EE0h);
4171 
4172             EE0l = _mm_add_epi32(EEE0l, EE0l);
4173             EE1l = _mm_add_epi32(EEE1l, EE1l);
4174             EE0h = _mm_add_epi32(EEE0h, EE0h);
4175             EE1h = _mm_add_epi32(EEE1h, EE1h);
4176             /**/
4177 
4178             EE7l = _mm_sub_epi32(EE0l, E00l);
4179             EE6l = _mm_sub_epi32(EE1l, E01l);
4180             EE5l = _mm_sub_epi32(EE2l, E02l);
4181             EE4l = _mm_sub_epi32(EE3l, E03l);
4182 
4183             EE7h = _mm_sub_epi32(EE0h, E00h);
4184             EE6h = _mm_sub_epi32(EE1h, E01h);
4185             EE5h = _mm_sub_epi32(EE2h, E02h);
4186             EE4h = _mm_sub_epi32(EE3h, E03h);
4187 
4188             EE0l = _mm_add_epi32(EE0l, E00l);
4189             EE1l = _mm_add_epi32(EE1l, E01l);
4190             EE2l = _mm_add_epi32(EE2l, E02l);
4191             EE3l = _mm_add_epi32(EE3l, E03l);
4192 
4193             EE0h = _mm_add_epi32(EE0h, E00h);
4194             EE1h = _mm_add_epi32(EE1h, E01h);
4195             EE2h = _mm_add_epi32(EE2h, E02h);
4196             EE3h = _mm_add_epi32(EE3h, E03h);
4197             /*      Compute E       */
4198 
4199             E15l = _mm_sub_epi32(EE0l, E0l);
4200             E15l = _mm_add_epi32(E15l, m128iAdd);
4201             E14l = _mm_sub_epi32(EE1l, E1l);
4202             E14l = _mm_add_epi32(E14l, m128iAdd);
4203             E13l = _mm_sub_epi32(EE2l, E2l);
4204             E13l = _mm_add_epi32(E13l, m128iAdd);
4205             E12l = _mm_sub_epi32(EE3l, E3l);
4206             E12l = _mm_add_epi32(E12l, m128iAdd);
4207             E11l = _mm_sub_epi32(EE4l, E4l);
4208             E11l = _mm_add_epi32(E11l, m128iAdd);
4209             E10l = _mm_sub_epi32(EE5l, E5l);
4210             E10l = _mm_add_epi32(E10l, m128iAdd);
4211             E9l = _mm_sub_epi32(EE6l, E6l);
4212             E9l = _mm_add_epi32(E9l, m128iAdd);
4213             E8l = _mm_sub_epi32(EE7l, E7l);
4214             E8l = _mm_add_epi32(E8l, m128iAdd);
4215 
4216             E0l = _mm_add_epi32(EE0l, E0l);
4217             E0l = _mm_add_epi32(E0l, m128iAdd);
4218             E1l = _mm_add_epi32(EE1l, E1l);
4219             E1l = _mm_add_epi32(E1l, m128iAdd);
4220             E2l = _mm_add_epi32(EE2l, E2l);
4221             E2l = _mm_add_epi32(E2l, m128iAdd);
4222             E3l = _mm_add_epi32(EE3l, E3l);
4223             E3l = _mm_add_epi32(E3l, m128iAdd);
4224             E4l = _mm_add_epi32(EE4l, E4l);
4225             E4l = _mm_add_epi32(E4l, m128iAdd);
4226             E5l = _mm_add_epi32(EE5l, E5l);
4227             E5l = _mm_add_epi32(E5l, m128iAdd);
4228             E6l = _mm_add_epi32(EE6l, E6l);
4229             E6l = _mm_add_epi32(E6l, m128iAdd);
4230             E7l = _mm_add_epi32(EE7l, E7l);
4231             E7l = _mm_add_epi32(E7l, m128iAdd);
4232 
4233             E15h = _mm_sub_epi32(EE0h, E0h);
4234             E15h = _mm_add_epi32(E15h, m128iAdd);
4235             E14h = _mm_sub_epi32(EE1h, E1h);
4236             E14h = _mm_add_epi32(E14h, m128iAdd);
4237             E13h = _mm_sub_epi32(EE2h, E2h);
4238             E13h = _mm_add_epi32(E13h, m128iAdd);
4239             E12h = _mm_sub_epi32(EE3h, E3h);
4240             E12h = _mm_add_epi32(E12h, m128iAdd);
4241             E11h = _mm_sub_epi32(EE4h, E4h);
4242             E11h = _mm_add_epi32(E11h, m128iAdd);
4243             E10h = _mm_sub_epi32(EE5h, E5h);
4244             E10h = _mm_add_epi32(E10h, m128iAdd);
4245             E9h = _mm_sub_epi32(EE6h, E6h);
4246             E9h = _mm_add_epi32(E9h, m128iAdd);
4247             E8h = _mm_sub_epi32(EE7h, E7h);
4248             E8h = _mm_add_epi32(E8h, m128iAdd);
4249 
4250             E0h = _mm_add_epi32(EE0h, E0h);
4251             E0h = _mm_add_epi32(E0h, m128iAdd);
4252             E1h = _mm_add_epi32(EE1h, E1h);
4253             E1h = _mm_add_epi32(E1h, m128iAdd);
4254             E2h = _mm_add_epi32(EE2h, E2h);
4255             E2h = _mm_add_epi32(E2h, m128iAdd);
4256             E3h = _mm_add_epi32(EE3h, E3h);
4257             E3h = _mm_add_epi32(E3h, m128iAdd);
4258             E4h = _mm_add_epi32(EE4h, E4h);
4259             E4h = _mm_add_epi32(E4h, m128iAdd);
4260             E5h = _mm_add_epi32(EE5h, E5h);
4261             E5h = _mm_add_epi32(E5h, m128iAdd);
4262             E6h = _mm_add_epi32(EE6h, E6h);
4263             E6h = _mm_add_epi32(E6h, m128iAdd);
4264             E7h = _mm_add_epi32(EE7h, E7h);
4265             E7h = _mm_add_epi32(E7h, m128iAdd);
4266 
4267             m128iS0 = _mm_packs_epi32(
4268                     _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
4269                     _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
4270             m128iS1 = _mm_packs_epi32(
4271                     _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
4272                     _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
4273             m128iS2 = _mm_packs_epi32(
4274                     _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
4275                     _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
4276             m128iS3 = _mm_packs_epi32(
4277                     _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
4278                     _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
4279             m128iS4 = _mm_packs_epi32(
4280                     _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
4281                     _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
4282             m128iS5 = _mm_packs_epi32(
4283                     _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
4284                     _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
4285             m128iS6 = _mm_packs_epi32(
4286                     _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
4287                     _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
4288             m128iS7 = _mm_packs_epi32(
4289                     _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
4290                     _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
4291             m128iS8 = _mm_packs_epi32(
4292                     _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
4293                     _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
4294             m128iS9 = _mm_packs_epi32(
4295                     _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
4296                     _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
4297             m128iS10 = _mm_packs_epi32(
4298                     _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
4299                     _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
4300             m128iS11 = _mm_packs_epi32(
4301                     _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
4302                     _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
4303             m128iS12 = _mm_packs_epi32(
4304                     _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
4305                     _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
4306             m128iS13 = _mm_packs_epi32(
4307                     _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
4308                     _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
4309             m128iS14 = _mm_packs_epi32(
4310                     _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
4311                     _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
4312             m128iS15 = _mm_packs_epi32(
4313                     _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
4314                     _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));
4315 
4316             m128iS31 = _mm_packs_epi32(
4317                     _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
4318                     _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
4319             m128iS30 = _mm_packs_epi32(
4320                     _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
4321                     _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
4322             m128iS29 = _mm_packs_epi32(
4323                     _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
4324                     _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
4325             m128iS28 = _mm_packs_epi32(
4326                     _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
4327                     _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
4328             m128iS27 = _mm_packs_epi32(
4329                     _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
4330                     _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
4331             m128iS26 = _mm_packs_epi32(
4332                     _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
4333                     _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
4334             m128iS25 = _mm_packs_epi32(
4335                     _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
4336                     _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
4337             m128iS24 = _mm_packs_epi32(
4338                     _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
4339                     _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
4340             m128iS23 = _mm_packs_epi32(
4341                     _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
4342                     _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
4343             m128iS22 = _mm_packs_epi32(
4344                     _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
4345                     _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
4346             m128iS21 = _mm_packs_epi32(
4347                     _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
4348                     _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
4349             m128iS20 = _mm_packs_epi32(
4350                     _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
4351                     _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
4352             m128iS19 = _mm_packs_epi32(
4353                     _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
4354                     _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
4355             m128iS18 = _mm_packs_epi32(
4356                     _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
4357                     _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
4358             m128iS17 = _mm_packs_epi32(
4359                     _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
4360                     _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
4361             m128iS16 = _mm_packs_epi32(
4362                     _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
4363                     _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
4364 
4365             if (!j) {
4366                 /*      Inverse the matrix      */
4367                 E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
4368                 E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
4369                 E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
4370                 E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
4371                 E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
4372                 E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
4373                 E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
4374                 E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
4375                 E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
4376                 E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
4377                 E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
4378                 E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
4379                 E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
4380                 E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
4381                 E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
4382                 E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);
4383 
4384                 O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
4385                 O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
4386                 O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
4387                 O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
4388                 O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
4389                 O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
4390                 O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
4391                 O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
4392                 O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
4393                 O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
4394                 O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
4395                 O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
4396                 O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
4397                 O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
4398                 O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
4399                 O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);
4400 
4401                 E0h = _mm_unpacklo_epi16(E0l, E8l);
4402                 E1h = _mm_unpacklo_epi16(E1l, E9l);
4403                 E2h = _mm_unpacklo_epi16(E2l, E10l);
4404                 E3h = _mm_unpacklo_epi16(E3l, E11l);
4405                 E4h = _mm_unpacklo_epi16(E4l, E12l);
4406                 E5h = _mm_unpacklo_epi16(E5l, E13l);
4407                 E6h = _mm_unpacklo_epi16(E6l, E14l);
4408                 E7h = _mm_unpacklo_epi16(E7l, E15l);
4409 
4410                 E8h = _mm_unpackhi_epi16(E0l, E8l);
4411                 E9h = _mm_unpackhi_epi16(E1l, E9l);
4412                 E10h = _mm_unpackhi_epi16(E2l, E10l);
4413                 E11h = _mm_unpackhi_epi16(E3l, E11l);
4414                 E12h = _mm_unpackhi_epi16(E4l, E12l);
4415                 E13h = _mm_unpackhi_epi16(E5l, E13l);
4416                 E14h = _mm_unpackhi_epi16(E6l, E14l);
4417                 E15h = _mm_unpackhi_epi16(E7l, E15l);
4418 
4419                 m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
4420                 m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
4421                 m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
4422                 m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);
4423 
4424                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4425                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
4426                 m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4427                 m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4428 
4429                 m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
4430                 m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
4431                 m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
4432                 m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4433 
4434                 m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
4435                 m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
4436                 m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
4437                 m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);
4438 
4439                 m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
4440                 m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                // (review) Continuation of a 16-bit matrix transpose built
                // from epi16 interleave stages; the E*h/m128Tmp* inputs are
                // produced by code above this chunk.  Each pair of unpacks
                // below yields one transposed output register m128iS*.
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                // Same interleave network applied to the O*l registers,
                // producing the remaining transposed rows m128iS16..m128iS31.
                // The E*h names are reused here purely as scratch registers.
                E0h = _mm_unpacklo_epi16(O0l, O8l);
                E1h = _mm_unpacklo_epi16(O1l, O9l);
                E2h = _mm_unpacklo_epi16(O2l, O10l);
                E3h = _mm_unpacklo_epi16(O3l, O11l);
                E4h = _mm_unpacklo_epi16(O4l, O12l);
                E5h = _mm_unpacklo_epi16(O5l, O13l);
                E6h = _mm_unpacklo_epi16(O6l, O14l);
                E7h = _mm_unpacklo_epi16(O7l, O15l);

                E8h = _mm_unpackhi_epi16(O0l, O8l);
                E9h = _mm_unpackhi_epi16(O1l, O9l);
                E10h = _mm_unpackhi_epi16(O2l, O10l);
                E11h = _mm_unpackhi_epi16(O3l, O11l);
                E12h = _mm_unpackhi_epi16(O4l, O12l);
                E13h = _mm_unpackhi_epi16(O5l, O13l);
                E14h = _mm_unpackhi_epi16(O6l, O14l);
                E15h = _mm_unpackhi_epi16(O7l, O15l);

                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);
4557 
                // (review) Spill the 32 transposed result registers into the
                // r0..r127 save area (one 32-register group per 8-column
                // pass, selected by i), then load the next 8 coefficient
                // columns.  Consecutive rows are 32 coefficients apart in
                // 'src', so src + 32*row + k addresses columns k..k+7 of
                // 'row' (assumes src points at int16_t — TODO confirm from
                // the function prologue above this chunk).
                if(i==0){
                    int k = 8;
                    r0=m128iS0;
                    r1=m128iS1;
                    r2=m128iS2;
                    r3=m128iS3;
                    r4=m128iS4;
                    r5=m128iS5;
                    r6=m128iS6;
                    r7=m128iS7;
                    r8=m128iS8;
                    r9=m128iS9;
                    r10=m128iS10;
                    r11=m128iS11;
                    r12=m128iS12;
                    r13=m128iS13;
                    r14=m128iS14;
                    r15=m128iS15;
                    r16=m128iS16;
                    r17=m128iS17;
                    r18=m128iS18;
                    r19=m128iS19;
                    r20=m128iS20;
                    r21=m128iS21;
                    r22=m128iS22;
                    r23=m128iS23;
                    r24=m128iS24;
                    r25=m128iS25;
                    r26=m128iS26;
                    r27=m128iS27;
                    r28=m128iS28;
                    r29=m128iS29;
                    r30=m128iS30;
                    r31=m128iS31;
                    // Load columns 8..15 of every row for the next pass.
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));

                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));

                }else if(i ==8){

                    r32=m128iS0;
                    r33=m128iS1;
                    r34=m128iS2;
                    r35=m128iS3;
                    r36=m128iS4;
                    r37=m128iS5;
                    r38=m128iS6;
                    r39=m128iS7;
                    r40=m128iS8;
                    r41=m128iS9;
                    r42=m128iS10;
                    r43=m128iS11;
                    r44=m128iS12;
                    r45=m128iS13;
                    r46=m128iS14;
                    r47=m128iS15;
                    r48=m128iS16;
                    r49=m128iS17;
                    r50=m128iS18;
                    r51=m128iS19;
                    r52=m128iS20;
                    r53=m128iS21;
                    r54=m128iS22;
                    r55=m128iS23;
                    r56=m128iS24;
                    r57=m128iS25;
                    r58=m128iS26;
                    r59=m128iS27;
                    r60=m128iS28;
                    r61=m128iS29;
                    r62=m128iS30;
                    r63=m128iS31;

                    // Columns 16..23.  Offsets are written as mixed literals
                    // (src + 16 vs src + 192 + 16); values are consistent
                    // (32*row + 16 throughout).
                    m128iS0 = _mm_load_si128((__m128i *) (src + 16));
                    m128iS1 = _mm_load_si128((__m128i *) (src + 48));
                    m128iS2 = _mm_load_si128((__m128i *) (src + 80));
                    m128iS3 = _mm_load_si128((__m128i *) (src + 112));
                    m128iS4 = _mm_load_si128((__m128i *) (src + 144));
                    m128iS5 = _mm_load_si128((__m128i *) (src + 176));
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 16));
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 16));
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 16));
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 16));
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 16));
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 16));
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 16));
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 16));
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 16));
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 16));

                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 16));
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 16));
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 16));
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 16));
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 16));
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 16));
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 16));
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 16));
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 16));
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 16));
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 16));
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 16));
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 16));
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 16));
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 16));
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 16));


                }else if(i ==16){

                    r64=m128iS0;
                    r65=m128iS1;
                    r66=m128iS2;
                    r67=m128iS3;
                    r68=m128iS4;
                    r69=m128iS5;
                    r70=m128iS6;
                    r71=m128iS7;
                    r72=m128iS8;
                    r73=m128iS9;
                    r74=m128iS10;
                    r75=m128iS11;
                    r76=m128iS12;
                    r77=m128iS13;
                    r78=m128iS14;
                    r79=m128iS15;
                    r80=m128iS16;
                    r81=m128iS17;
                    r82=m128iS18;
                    r83=m128iS19;
                    r84=m128iS20;
                    r85=m128iS21;
                    r86=m128iS22;
                    r87=m128iS23;
                    r88=m128iS24;
                    r89=m128iS25;
                    r90=m128iS26;
                    r91=m128iS27;
                    r92=m128iS28;
                    r93=m128iS29;
                    r94=m128iS30;
                    r95=m128iS31;

                    // Columns 24..31 (32*row + 24 throughout).
                    m128iS0 = _mm_load_si128((__m128i *) (src + 24));
                    m128iS1 = _mm_load_si128((__m128i *) (src + 56));
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + 24));
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + 24));
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + 24));
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + 24));
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + 24));
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + 24));
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + 24));
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + 24));
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + 24));
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + 24));
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + 24));
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + 24));
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + 24));
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + 24));

                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + 24));
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + 24));
                    m128iS18 = _mm_load_si128((__m128i *) (src + 576 + 24));
                    m128iS19 = _mm_load_si128((__m128i *) (src + 608 + 24));
                    m128iS20 = _mm_load_si128((__m128i *) (src + 640 + 24));
                    m128iS21 = _mm_load_si128((__m128i *) (src + 672 + 24));
                    m128iS22 = _mm_load_si128((__m128i *) (src + 704 + 24));
                    m128iS23 = _mm_load_si128((__m128i *) (src + 736 + 24));
                    m128iS24 = _mm_load_si128((__m128i *) (src + 768 + 24));
                    m128iS25 = _mm_load_si128((__m128i *) (src + 800 + 24));
                    m128iS26 = _mm_load_si128((__m128i *) (src + 832 + 24));
                    m128iS27 = _mm_load_si128((__m128i *) (src + 864 + 24));
                    m128iS28 = _mm_load_si128((__m128i *) (src + 896 + 24));
                    m128iS29 = _mm_load_si128((__m128i *) (src + 928 + 24));
                    m128iS30 = _mm_load_si128((__m128i *) (src + 960 + 24));
                    m128iS31 = _mm_load_si128((__m128i *) (src + 992 + 24));

                }else{
                    // Last column pass: save into r96..r127, then seed the
                    // next stage from every 4th spill register (r0, r4, ...)
                    // and switch to the second-pass shift/rounding constants.
                    r96=m128iS0;
                    r97=m128iS1;
                    r98=m128iS2;
                    r99=m128iS3;
                    r100=m128iS4;
                    r101=m128iS5;
                    r102=m128iS6;
                    r103=m128iS7;
                    r104=m128iS8;
                    r105=m128iS9;
                    r106=m128iS10;
                    r107=m128iS11;
                    r108=m128iS12;
                    r109=m128iS13;
                    r110=m128iS14;
                    r111=m128iS15;
                    r112=m128iS16;
                    r113=m128iS17;
                    r114=m128iS18;
                    r115=m128iS19;
                    r116=m128iS20;
                    r117=m128iS21;
                    r118=m128iS22;
                    r119=m128iS23;
                    r120=m128iS24;
                    r121=m128iS25;
                    r122=m128iS26;
                    r123=m128iS27;
                    r124=m128iS28;
                    r125=m128iS29;
                    r126=m128iS30;
                    r127=m128iS31;

                    //load data for next j :
                    m128iS0 =  r0;
                    m128iS1 =  r4;
                    m128iS2 =  r8;
                    m128iS3 =  r12;
                    m128iS4 =  r16;
                    m128iS5 =  r20;
                    m128iS6 =  r24;
                    m128iS7 =  r28;
                    m128iS8 =  r32;
                    m128iS9 =  r36;
                    m128iS10 = r40;
                    m128iS11 = r44;
                    m128iS12 = r48;
                    m128iS13 = r52;
                    m128iS14 = r56;
                    m128iS15 = r60;
                    m128iS16 = r64;
                    m128iS17 = r68;
                    m128iS18 = r72;
                    m128iS19 = r76;
                    m128iS20 = r80;
                    m128iS21 = r84;
                    m128iS22 = r88;
                    m128iS23 = r92;
                    m128iS24 = r96;
                    m128iS25 = r100;
                    m128iS26 = r104;
                    m128iS27 = r108;
                    m128iS28 = r112;
                    m128iS29 = r116;
                    m128iS30 = r120;
                    m128iS31 =r124;
                    shift = shift_2nd;
                    m128iAdd = _mm_set1_epi32(add_2nd);


                }
4837 
            } else {

                //Transpose Matrix
                // (review) Final pass: transpose the 16-bit results via
                // epi16 -> epi32 -> epi64 interleave stages so each m128iS*
                // holds 8 consecutive output samples of one row, then add to
                // the predictor and store.

                E0l= _mm_unpacklo_epi16(m128iS0,m128iS1);
                E1l= _mm_unpacklo_epi16(m128iS2,m128iS3);
                E2l= _mm_unpacklo_epi16(m128iS4,m128iS5);
                E3l= _mm_unpacklo_epi16(m128iS6,m128iS7);
                E4l= _mm_unpacklo_epi16(m128iS8,m128iS9);
                E5l= _mm_unpacklo_epi16(m128iS10,m128iS11);
                E6l= _mm_unpacklo_epi16(m128iS12,m128iS13);
                E7l= _mm_unpacklo_epi16(m128iS14,m128iS15);
                E8l= _mm_unpacklo_epi16(m128iS16,m128iS17);
                E9l= _mm_unpacklo_epi16(m128iS18,m128iS19);
                E10l= _mm_unpacklo_epi16(m128iS20,m128iS21);
                E11l= _mm_unpacklo_epi16(m128iS22,m128iS23);
                E12l= _mm_unpacklo_epi16(m128iS24,m128iS25);
                E13l= _mm_unpacklo_epi16(m128iS26,m128iS27);
                E14l= _mm_unpacklo_epi16(m128iS28,m128iS29);
                E15l= _mm_unpacklo_epi16(m128iS30,m128iS31);


                E0h= _mm_unpackhi_epi16(m128iS0,m128iS1);
                E1h= _mm_unpackhi_epi16(m128iS2,m128iS3);
                E2h= _mm_unpackhi_epi16(m128iS4,m128iS5);
                E3h= _mm_unpackhi_epi16(m128iS6,m128iS7);
                E4h= _mm_unpackhi_epi16(m128iS8,m128iS9);
                E5h= _mm_unpackhi_epi16(m128iS10,m128iS11);
                E6h= _mm_unpackhi_epi16(m128iS12,m128iS13);
                E7h= _mm_unpackhi_epi16(m128iS14,m128iS15);
                E8h= _mm_unpackhi_epi16(m128iS16,m128iS17);
                E9h= _mm_unpackhi_epi16(m128iS18,m128iS19);
                E10h= _mm_unpackhi_epi16(m128iS20,m128iS21);
                E11h= _mm_unpackhi_epi16(m128iS22,m128iS23);
                E12h= _mm_unpackhi_epi16(m128iS24,m128iS25);
                E13h= _mm_unpackhi_epi16(m128iS26,m128iS27);
                E14h= _mm_unpackhi_epi16(m128iS28,m128iS29);
                E15h= _mm_unpackhi_epi16(m128iS30,m128iS31);

                m128Tmp0= _mm_unpacklo_epi32(E0l,E1l);
                m128Tmp1= _mm_unpacklo_epi32(E2l,E3l);
                m128Tmp2= _mm_unpacklo_epi32(E4l,E5l);
                m128Tmp3= _mm_unpacklo_epi32(E6l,E7l);
                m128Tmp4= _mm_unpacklo_epi32(E8l,E9l);
                m128Tmp5= _mm_unpacklo_epi32(E10l,E11l);
                m128Tmp6= _mm_unpacklo_epi32(E12l,E13l);
                m128Tmp7= _mm_unpacklo_epi32(E14l,E15l);

                m128iS0= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter 1st row
                m128iS1= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter 1st row


                m128iS2= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter 1st row
                m128iS3= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter 1st row

                //second row

                m128iS4= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS5= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter

                m128iS6= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS7= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter

               //third row

                m128Tmp0= _mm_unpackhi_epi32(E0l,E1l);
                m128Tmp1= _mm_unpackhi_epi32(E2l,E3l);
                m128Tmp2= _mm_unpackhi_epi32(E4l,E5l);
                m128Tmp3= _mm_unpackhi_epi32(E6l,E7l);
                m128Tmp4= _mm_unpackhi_epi32(E8l,E9l);
                m128Tmp5= _mm_unpackhi_epi32(E10l,E11l);
                m128Tmp6= _mm_unpackhi_epi32(E12l,E13l);
                m128Tmp7= _mm_unpackhi_epi32(E14l,E15l);


                m128iS8= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS9= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter

                m128iS10= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS11= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter

                //fourth row

                m128iS12= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS13= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter

                m128iS14= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS15= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter

                //fifth row

                m128Tmp0= _mm_unpacklo_epi32(E0h,E1h);
                m128Tmp1= _mm_unpacklo_epi32(E2h,E3h);
                m128Tmp2= _mm_unpacklo_epi32(E4h,E5h);
                m128Tmp3= _mm_unpacklo_epi32(E6h,E7h);
                m128Tmp4= _mm_unpacklo_epi32(E8h,E9h);
                m128Tmp5= _mm_unpacklo_epi32(E10h,E11h);
                m128Tmp6= _mm_unpacklo_epi32(E12h,E13h);
                m128Tmp7= _mm_unpacklo_epi32(E14h,E15h);

                m128iS16= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS17= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter


                m128iS18= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS19= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7);

                //sixth row

                m128iS20= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS21= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter


                m128iS22= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS23= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter

               //seventh row

                m128Tmp0= _mm_unpackhi_epi32(E0h,E1h);
                m128Tmp1= _mm_unpackhi_epi32(E2h,E3h);
                m128Tmp2= _mm_unpackhi_epi32(E4h,E5h);
                m128Tmp3= _mm_unpackhi_epi32(E6h,E7h);
                m128Tmp4= _mm_unpackhi_epi32(E8h,E9h);
                m128Tmp5= _mm_unpackhi_epi32(E10h,E11h);
                m128Tmp6= _mm_unpackhi_epi32(E12h,E13h);
                m128Tmp7= _mm_unpackhi_epi32(E14h,E15h);


                m128iS24= _mm_unpacklo_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS25= _mm_unpacklo_epi64(m128Tmp2,m128Tmp3); //second quarter


                m128iS26= _mm_unpacklo_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS27= _mm_unpacklo_epi64(m128Tmp6,m128Tmp7); //last quarter

                //last row


                m128iS28= _mm_unpackhi_epi64(m128Tmp0,m128Tmp1); //first quarter
                m128iS29= _mm_unpackhi_epi64(m128Tmp2,m128Tmp3); //second quarter

                m128iS30= _mm_unpackhi_epi64(m128Tmp4,m128Tmp5); //third quarter
                m128iS31= _mm_unpackhi_epi64(m128Tmp6,m128Tmp7); //last quarter


                // Zero register, used below to widen the uint8 predictor
                // samples to 16 bit via byte unpacking.
                m128Tmp0=_mm_setzero_si128();


                //store
                // Reconstruct 8 rows of 32 pixels: load two 16-byte predictor
                // vectors per row, add the residual with signed saturation
                // (_mm_adds_epi16), and repack to uint8 with unsigned
                // saturation (_mm_packus_epi16), which also clips to 0..255.
                // NOTE(review): aligned load/store intrinsics are used, so
                // _dst + i*stride (and stride itself) must be 16-byte
                // aligned — confirm the caller guarantees this.
                dst = (uint8_t*) _dst + i*stride;


                E0l= _mm_load_si128((__m128i*)dst); //16 values
                E1l= _mm_load_si128((__m128i*)(dst+16));
                E2l= _mm_load_si128((__m128i*)(dst+stride));
                E3l= _mm_load_si128((__m128i*)(dst+stride+16));
                E4l= _mm_load_si128((__m128i*)(dst+2*stride));
                E5l= _mm_load_si128((__m128i*)(dst+2*stride+16));
                E6l= _mm_load_si128((__m128i*)(dst+3*stride));
                E7l= _mm_load_si128((__m128i*)(dst+3*stride+16));
                E8l= _mm_load_si128((__m128i*)(dst+4*stride));
                E9l= _mm_load_si128((__m128i*)(dst+4*stride+16));
                E10l= _mm_load_si128((__m128i*)(dst+5*stride));
                E11l= _mm_load_si128((__m128i*)(dst+5*stride+16));
                E12l= _mm_load_si128((__m128i*)(dst+6*stride));
                E13l= _mm_load_si128((__m128i*)(dst+6*stride+16));
                E14l= _mm_load_si128((__m128i*)(dst+7*stride));
                E15l= _mm_load_si128((__m128i*)(dst+7*stride+16));

                m128iS0= _mm_adds_epi16(m128iS0,_mm_unpacklo_epi8(E0l,m128Tmp0));
                m128iS1= _mm_adds_epi16(m128iS1,_mm_unpackhi_epi8(E0l,m128Tmp0));
                m128iS0= _mm_packus_epi16(m128iS0,m128iS1);

                m128iS2= _mm_adds_epi16(m128iS2,_mm_unpacklo_epi8(E1l,m128Tmp0));
                m128iS3= _mm_adds_epi16(m128iS3,_mm_unpackhi_epi8(E1l,m128Tmp0));
                m128iS2= _mm_packus_epi16(m128iS2,m128iS3);

                m128iS4= _mm_adds_epi16(m128iS4,_mm_unpacklo_epi8(E2l,m128Tmp0));
                m128iS5= _mm_adds_epi16(m128iS5,_mm_unpackhi_epi8(E2l,m128Tmp0));
                m128iS4= _mm_packus_epi16(m128iS4,m128iS5);

                m128iS6= _mm_adds_epi16(m128iS6,_mm_unpacklo_epi8(E3l,m128Tmp0));
                m128iS7= _mm_adds_epi16(m128iS7,_mm_unpackhi_epi8(E3l,m128Tmp0));
                m128iS6= _mm_packus_epi16(m128iS6,m128iS7);

                m128iS8= _mm_adds_epi16(m128iS8,_mm_unpacklo_epi8(E4l,m128Tmp0));
                m128iS9= _mm_adds_epi16(m128iS9,_mm_unpackhi_epi8(E4l,m128Tmp0));
                m128iS8= _mm_packus_epi16(m128iS8,m128iS9);

                m128iS10= _mm_adds_epi16(m128iS10,_mm_unpacklo_epi8(E5l,m128Tmp0));
                m128iS11= _mm_adds_epi16(m128iS11,_mm_unpackhi_epi8(E5l,m128Tmp0));
                m128iS10= _mm_packus_epi16(m128iS10,m128iS11);

                m128iS12= _mm_adds_epi16(m128iS12,_mm_unpacklo_epi8(E6l,m128Tmp0));
                m128iS13= _mm_adds_epi16(m128iS13,_mm_unpackhi_epi8(E6l,m128Tmp0));
                m128iS12= _mm_packus_epi16(m128iS12,m128iS13);

                m128iS14= _mm_adds_epi16(m128iS14,_mm_unpacklo_epi8(E7l,m128Tmp0));
                m128iS15= _mm_adds_epi16(m128iS15,_mm_unpackhi_epi8(E7l,m128Tmp0));
                m128iS14= _mm_packus_epi16(m128iS14,m128iS15);

                m128iS16= _mm_adds_epi16(m128iS16,_mm_unpacklo_epi8(E8l,m128Tmp0));
                m128iS17= _mm_adds_epi16(m128iS17,_mm_unpackhi_epi8(E8l,m128Tmp0));
                m128iS16= _mm_packus_epi16(m128iS16,m128iS17);

                m128iS18= _mm_adds_epi16(m128iS18,_mm_unpacklo_epi8(E9l,m128Tmp0));
                m128iS19= _mm_adds_epi16(m128iS19,_mm_unpackhi_epi8(E9l,m128Tmp0));
                m128iS18= _mm_packus_epi16(m128iS18,m128iS19);

                m128iS20= _mm_adds_epi16(m128iS20,_mm_unpacklo_epi8(E10l,m128Tmp0));
                m128iS21= _mm_adds_epi16(m128iS21,_mm_unpackhi_epi8(E10l,m128Tmp0));
                m128iS20= _mm_packus_epi16(m128iS20,m128iS21);

                m128iS22= _mm_adds_epi16(m128iS22,_mm_unpacklo_epi8(E11l,m128Tmp0));
                m128iS23= _mm_adds_epi16(m128iS23,_mm_unpackhi_epi8(E11l,m128Tmp0));
                m128iS22= _mm_packus_epi16(m128iS22,m128iS23);

                m128iS24= _mm_adds_epi16(m128iS24,_mm_unpacklo_epi8(E12l,m128Tmp0));
                m128iS25= _mm_adds_epi16(m128iS25,_mm_unpackhi_epi8(E12l,m128Tmp0));
                m128iS24= _mm_packus_epi16(m128iS24,m128iS25);

                m128iS26= _mm_adds_epi16(m128iS26,_mm_unpacklo_epi8(E13l,m128Tmp0));
                m128iS27= _mm_adds_epi16(m128iS27,_mm_unpackhi_epi8(E13l,m128Tmp0));
                m128iS26= _mm_packus_epi16(m128iS26,m128iS27);

                m128iS28= _mm_adds_epi16(m128iS28,_mm_unpacklo_epi8(E14l,m128Tmp0));
                m128iS29= _mm_adds_epi16(m128iS29,_mm_unpackhi_epi8(E14l,m128Tmp0));
                m128iS28= _mm_packus_epi16(m128iS28,m128iS29);

                m128iS30= _mm_adds_epi16(m128iS30,_mm_unpacklo_epi8(E15l,m128Tmp0));
                m128iS31= _mm_adds_epi16(m128iS31,_mm_unpackhi_epi8(E15l,m128Tmp0));
                m128iS30= _mm_packus_epi16(m128iS30,m128iS31);


                _mm_store_si128((__m128i*)dst,m128iS0);
                _mm_store_si128((__m128i*)(dst+16),m128iS2);
                _mm_store_si128((__m128i*)(dst+stride),m128iS4);
                _mm_store_si128((__m128i*)(dst+stride+16),m128iS6);
                _mm_store_si128((__m128i*)(dst+2*stride),m128iS8);
                _mm_store_si128((__m128i*)(dst+2*stride+16),m128iS10);
                _mm_store_si128((__m128i*)(dst+3*stride),m128iS12);
                _mm_store_si128((__m128i*)(dst+3*stride+16),m128iS14);
                _mm_store_si128((__m128i*)(dst+4*stride),m128iS16);
                _mm_store_si128((__m128i*)(dst+4*stride+16),m128iS18);
                _mm_store_si128((__m128i*)(dst+5*stride),m128iS20);
                _mm_store_si128((__m128i*)(dst+5*stride+16),m128iS22);
                _mm_store_si128((__m128i*)(dst+6*stride),m128iS24);
                _mm_store_si128((__m128i*)(dst+6*stride+16),m128iS26);
                _mm_store_si128((__m128i*)(dst+7*stride),m128iS28);
                _mm_store_si128((__m128i*)(dst+7*stride+16),m128iS30);

5090                 if(i==0){
5091                     //load next values :
5092                     m128iS0 =  r1;
5093                     m128iS1 =  r5;
5094                     m128iS2 =  r9;
5095                     m128iS3 =  r13;
5096                     m128iS4 =  r17;
5097                     m128iS5 =  r21;
5098                     m128iS6 =  r25;
5099                     m128iS7 =  r29;
5100                     m128iS8 =  r33;
5101                     m128iS9 =  r37;
5102                     m128iS10 = r41;
5103                     m128iS11 = r45;
5104                     m128iS12 = r49;
5105                     m128iS13 = r53;
5106                     m128iS14 = r57;
5107                     m128iS15 = r61;
5108                     m128iS16 = r65;
5109                     m128iS17 = r69;
5110                     m128iS18 = r73;
5111                     m128iS19 = r77;
5112                     m128iS20 = r81;
5113                     m128iS21 = r85;
5114                     m128iS22 = r89;
5115                     m128iS23 = r93;
5116                     m128iS24 = r97;
5117                     m128iS25 = r101;
5118                     m128iS26 = r105;
5119                     m128iS27 = r109;
5120                     m128iS28 = r113;
5121                     m128iS29 = r117;
5122                     m128iS30 = r121;
5123                     m128iS31 =r125;
5124 
5125                 }else if(i ==8){
5126                     //load next values :
5127                     m128iS0 =  r2;
5128                     m128iS1 =  r6;
5129                     m128iS2 =  r10;
5130                     m128iS3 =  r14;
5131                     m128iS4 =  r18;
5132                     m128iS5 =  r22;
5133                     m128iS6 =  r26;
5134                     m128iS7 =  r30;
5135                     m128iS8 =  r34;
5136                     m128iS9 =  r38;
5137                     m128iS10 = r42;
5138                     m128iS11 = r46;
5139                     m128iS12 = r50;
5140                     m128iS13 = r54;
5141                     m128iS14 = r58;
5142                     m128iS15 = r62;
5143                     m128iS16 = r66;
5144                     m128iS17 = r70;
5145                     m128iS18 = r74;
5146                     m128iS19 = r78;
5147                     m128iS20 = r82;
5148                     m128iS21 = r86;
5149                     m128iS22 = r90;
5150                     m128iS23 = r94;
5151                     m128iS24 = r98;
5152                     m128iS25 = r102;
5153                     m128iS26 = r106;
5154                     m128iS27 = r110;
5155                     m128iS28 = r114;
5156                     m128iS29 = r118;
5157                     m128iS30 = r122;
5158                     m128iS31 =r126;
5159 
5160                 }else if(i==16)
5161                 {
5162                     //load next values :
5163                     m128iS0 =  r3;
5164                     m128iS1 =  r7;
5165                     m128iS2 =  r11;
5166                     m128iS3 =  r15;
5167                     m128iS4 =  r19;
5168                     m128iS5 =  r23;
5169                     m128iS6 =  r27;
5170                     m128iS7 =  r31;
5171                     m128iS8 =  r35;
5172                     m128iS9 =  r39;
5173                     m128iS10 = r43;
5174                     m128iS11 = r47;
5175                     m128iS12 = r51;
5176                     m128iS13 = r55;
5177                     m128iS14 = r59;
5178                     m128iS15 = r63;
5179                     m128iS16 = r67;
5180                     m128iS17 = r71;
5181                     m128iS18 = r75;
5182                     m128iS19 = r79;
5183                     m128iS20 = r83;
5184                     m128iS21 = r87;
5185                     m128iS22 = r91;
5186                     m128iS23 = r95;
5187                     m128iS24 = r99;
5188                     m128iS25 = r103;
5189                     m128iS26 = r107;
5190                     m128iS27 = r111;
5191                     m128iS28 = r115;
5192                     m128iS29 = r119;
5193                     m128iS30 = r123;
5194                     m128iS31 =r127;
5195                 }
5196             }
5197         }
5198     }
5199 }
5200 #endif
5201 
5202 
5203 #if 0
5204 void ff_hevc_transform_32x32_add_10_sse4(uint8_t *_dst, const int16_t *coeffs,
5205         ptrdiff_t _stride) {
5206     int i, j;
5207     uint16_t *dst = (uint16_t*) _dst;
5208     ptrdiff_t stride = _stride / 2;
5209     int shift;
5210     uint8_t shift_2nd = 10; //20 - bit depth
5211     uint16_t add_2nd = 1<<9; //shift2 - 1
5212     int16_t *src = coeffs;
5213 
5214     __m128i m128iS0, m128iS1, m128iS2, m128iS3, m128iS4, m128iS5, m128iS6,
5215             m128iS7, m128iS8, m128iS9, m128iS10, m128iS11, m128iS12, m128iS13,
5216             m128iS14, m128iS15, m128iAdd, m128Tmp0, m128Tmp1, m128Tmp2,
5217             m128Tmp3, m128Tmp4, m128Tmp5, m128Tmp6, m128Tmp7, E0h, E1h, E2h,
5218             E3h, E0l, E1l, E2l, E3l, O0h, O1h, O2h, O3h, O4h, O5h, O6h, O7h,
5219             O0l, O1l, O2l, O3l, O4l, O5l, O6l, O7l, EE0l, EE1l, EE2l, EE3l,
5220             E00l, E01l, EE0h, EE1h, EE2h, EE3h, E00h, E01h;
5221     __m128i E4l, E5l, E6l, E7l, E8l, E9l, E10l, E11l, E12l, E13l, E14l, E15l;
5222     __m128i E4h, E5h, E6h, E7h, E8h, E9h, E10h, E11h, E12h, E13h, E14h, E15h,
5223             EEE0l, EEE1l, EEE0h, EEE1h;
5224     __m128i m128iS16, m128iS17, m128iS18, m128iS19, m128iS20, m128iS21,
5225             m128iS22, m128iS23, m128iS24, m128iS25, m128iS26, m128iS27,
5226             m128iS28, m128iS29, m128iS30, m128iS31, m128Tmp8, m128Tmp9,
5227             m128Tmp10, m128Tmp11, m128Tmp12, m128Tmp13, m128Tmp14, m128Tmp15,
5228             O8h, O9h, O10h, O11h, O12h, O13h, O14h, O15h, O8l, O9l, O10l, O11l,
5229             O12l, O13l, O14l, O15l, E02l, E02h, E03l, E03h, EE7l, EE6l, EE5l,
5230             EE4l, EE7h, EE6h, EE5h, EE4h;
5231     m128iS0 = _mm_load_si128((__m128i *) (src));
5232     m128iS1 = _mm_load_si128((__m128i *) (src + 32));
5233     m128iS2 = _mm_load_si128((__m128i *) (src + 64));
5234     m128iS3 = _mm_load_si128((__m128i *) (src + 96));
5235     m128iS4 = _mm_loadu_si128((__m128i *) (src + 128));
5236     m128iS5 = _mm_load_si128((__m128i *) (src + 160));
5237     m128iS6 = _mm_load_si128((__m128i *) (src + 192));
5238     m128iS7 = _mm_load_si128((__m128i *) (src + 224));
5239     m128iS8 = _mm_load_si128((__m128i *) (src + 256));
5240     m128iS9 = _mm_load_si128((__m128i *) (src + 288));
5241     m128iS10 = _mm_load_si128((__m128i *) (src + 320));
5242     m128iS11 = _mm_load_si128((__m128i *) (src + 352));
5243     m128iS12 = _mm_loadu_si128((__m128i *) (src + 384));
5244     m128iS13 = _mm_load_si128((__m128i *) (src + 416));
5245     m128iS14 = _mm_load_si128((__m128i *) (src + 448));
5246     m128iS15 = _mm_load_si128((__m128i *) (src + 480));
5247     m128iS16 = _mm_load_si128((__m128i *) (src + 512));
5248     m128iS17 = _mm_load_si128((__m128i *) (src + 544));
5249     m128iS18 = _mm_load_si128((__m128i *) (src + 576));
5250     m128iS19 = _mm_load_si128((__m128i *) (src + 608));
5251     m128iS20 = _mm_load_si128((__m128i *) (src + 640));
5252     m128iS21 = _mm_load_si128((__m128i *) (src + 672));
5253     m128iS22 = _mm_load_si128((__m128i *) (src + 704));
5254     m128iS23 = _mm_load_si128((__m128i *) (src + 736));
5255     m128iS24 = _mm_load_si128((__m128i *) (src + 768));
5256     m128iS25 = _mm_load_si128((__m128i *) (src + 800));
5257     m128iS26 = _mm_load_si128((__m128i *) (src + 832));
5258     m128iS27 = _mm_load_si128((__m128i *) (src + 864));
5259     m128iS28 = _mm_load_si128((__m128i *) (src + 896));
5260     m128iS29 = _mm_load_si128((__m128i *) (src + 928));
5261     m128iS30 = _mm_load_si128((__m128i *) (src + 960));
5262     m128iS31 = _mm_load_si128((__m128i *) (src + 992));
5263 
5264     shift = shift_1st;
5265     m128iAdd = _mm_set1_epi32(add_1st);
5266 
5267     for (j = 0; j < 2; j++) {
5268         for (i = 0; i < 32; i += 8) {
5269             m128Tmp0 = _mm_unpacklo_epi16(m128iS1, m128iS3);
5270             E0l = _mm_madd_epi16(m128Tmp0,
5271                     _mm_load_si128((__m128i *) (transform32x32[0][0])));
5272             m128Tmp1 = _mm_unpackhi_epi16(m128iS1, m128iS3);
5273             E0h = _mm_madd_epi16(m128Tmp1,
5274                     _mm_load_si128((__m128i *) (transform32x32[0][0])));
5275 
5276             m128Tmp2 = _mm_unpacklo_epi16(m128iS5, m128iS7);
5277             E1l = _mm_madd_epi16(m128Tmp2,
5278                     _mm_load_si128((__m128i *) (transform32x32[1][0])));
5279             m128Tmp3 = _mm_unpackhi_epi16(m128iS5, m128iS7);
5280             E1h = _mm_madd_epi16(m128Tmp3,
5281                     _mm_load_si128((__m128i *) (transform32x32[1][0])));
5282 
5283             m128Tmp4 = _mm_unpacklo_epi16(m128iS9, m128iS11);
5284             E2l = _mm_madd_epi16(m128Tmp4,
5285                     _mm_load_si128((__m128i *) (transform32x32[2][0])));
5286             m128Tmp5 = _mm_unpackhi_epi16(m128iS9, m128iS11);
5287             E2h = _mm_madd_epi16(m128Tmp5,
5288                     _mm_load_si128((__m128i *) (transform32x32[2][0])));
5289 
5290             m128Tmp6 = _mm_unpacklo_epi16(m128iS13, m128iS15);
5291             E3l = _mm_madd_epi16(m128Tmp6,
5292                     _mm_load_si128((__m128i *) (transform32x32[3][0])));
5293             m128Tmp7 = _mm_unpackhi_epi16(m128iS13, m128iS15);
5294             E3h = _mm_madd_epi16(m128Tmp7,
5295                     _mm_load_si128((__m128i *) (transform32x32[3][0])));
5296 
5297             m128Tmp8 = _mm_unpacklo_epi16(m128iS17, m128iS19);
5298             E4l = _mm_madd_epi16(m128Tmp8,
5299                     _mm_load_si128((__m128i *) (transform32x32[4][0])));
5300             m128Tmp9 = _mm_unpackhi_epi16(m128iS17, m128iS19);
5301             E4h = _mm_madd_epi16(m128Tmp9,
5302                     _mm_load_si128((__m128i *) (transform32x32[4][0])));
5303 
5304             m128Tmp10 = _mm_unpacklo_epi16(m128iS21, m128iS23);
5305             E5l = _mm_madd_epi16(m128Tmp10,
5306                     _mm_load_si128((__m128i *) (transform32x32[5][0])));
5307             m128Tmp11 = _mm_unpackhi_epi16(m128iS21, m128iS23);
5308             E5h = _mm_madd_epi16(m128Tmp11,
5309                     _mm_load_si128((__m128i *) (transform32x32[5][0])));
5310 
5311             m128Tmp12 = _mm_unpacklo_epi16(m128iS25, m128iS27);
5312             E6l = _mm_madd_epi16(m128Tmp12,
5313                     _mm_load_si128((__m128i *) (transform32x32[6][0])));
5314             m128Tmp13 = _mm_unpackhi_epi16(m128iS25, m128iS27);
5315             E6h = _mm_madd_epi16(m128Tmp13,
5316                     _mm_load_si128((__m128i *) (transform32x32[6][0])));
5317 
5318             m128Tmp14 = _mm_unpacklo_epi16(m128iS29, m128iS31);
5319             E7l = _mm_madd_epi16(m128Tmp14,
5320                     _mm_load_si128((__m128i *) (transform32x32[7][0])));
5321             m128Tmp15 = _mm_unpackhi_epi16(m128iS29, m128iS31);
5322             E7h = _mm_madd_epi16(m128Tmp15,
5323                     _mm_load_si128((__m128i *) (transform32x32[7][0])));
5324 
5325             O0l = _mm_add_epi32(E0l, E1l);
5326             O0l = _mm_add_epi32(O0l, E2l);
5327             O0l = _mm_add_epi32(O0l, E3l);
5328             O0l = _mm_add_epi32(O0l, E4l);
5329             O0l = _mm_add_epi32(O0l, E5l);
5330             O0l = _mm_add_epi32(O0l, E6l);
5331             O0l = _mm_add_epi32(O0l, E7l);
5332 
5333             O0h = _mm_add_epi32(E0h, E1h);
5334             O0h = _mm_add_epi32(O0h, E2h);
5335             O0h = _mm_add_epi32(O0h, E3h);
5336             O0h = _mm_add_epi32(O0h, E4h);
5337             O0h = _mm_add_epi32(O0h, E5h);
5338             O0h = _mm_add_epi32(O0h, E6h);
5339             O0h = _mm_add_epi32(O0h, E7h);
5340 
5341             /* Compute O1*/
5342             E0l = _mm_madd_epi16(m128Tmp0,
5343                     _mm_load_si128((__m128i *) (transform32x32[0][1])));
5344             E0h = _mm_madd_epi16(m128Tmp1,
5345                     _mm_load_si128((__m128i *) (transform32x32[0][1])));
5346             E1l = _mm_madd_epi16(m128Tmp2,
5347                     _mm_load_si128((__m128i *) (transform32x32[1][1])));
5348             E1h = _mm_madd_epi16(m128Tmp3,
5349                     _mm_load_si128((__m128i *) (transform32x32[1][1])));
5350             E2l = _mm_madd_epi16(m128Tmp4,
5351                     _mm_load_si128((__m128i *) (transform32x32[2][1])));
5352             E2h = _mm_madd_epi16(m128Tmp5,
5353                     _mm_load_si128((__m128i *) (transform32x32[2][1])));
5354             E3l = _mm_madd_epi16(m128Tmp6,
5355                     _mm_load_si128((__m128i *) (transform32x32[3][1])));
5356             E3h = _mm_madd_epi16(m128Tmp7,
5357                     _mm_load_si128((__m128i *) (transform32x32[3][1])));
5358 
5359             E4l = _mm_madd_epi16(m128Tmp8,
5360                     _mm_load_si128((__m128i *) (transform32x32[4][1])));
5361             E4h = _mm_madd_epi16(m128Tmp9,
5362                     _mm_load_si128((__m128i *) (transform32x32[4][1])));
5363             E5l = _mm_madd_epi16(m128Tmp10,
5364                     _mm_load_si128((__m128i *) (transform32x32[5][1])));
5365             E5h = _mm_madd_epi16(m128Tmp11,
5366                     _mm_load_si128((__m128i *) (transform32x32[5][1])));
5367             E6l = _mm_madd_epi16(m128Tmp12,
5368                     _mm_load_si128((__m128i *) (transform32x32[6][1])));
5369             E6h = _mm_madd_epi16(m128Tmp13,
5370                     _mm_load_si128((__m128i *) (transform32x32[6][1])));
5371             E7l = _mm_madd_epi16(m128Tmp14,
5372                     _mm_load_si128((__m128i *) (transform32x32[7][1])));
5373             E7h = _mm_madd_epi16(m128Tmp15,
5374                     _mm_load_si128((__m128i *) (transform32x32[7][1])));
5375 
5376             O1l = _mm_add_epi32(E0l, E1l);
5377             O1l = _mm_add_epi32(O1l, E2l);
5378             O1l = _mm_add_epi32(O1l, E3l);
5379             O1l = _mm_add_epi32(O1l, E4l);
5380             O1l = _mm_add_epi32(O1l, E5l);
5381             O1l = _mm_add_epi32(O1l, E6l);
5382             O1l = _mm_add_epi32(O1l, E7l);
5383 
5384             O1h = _mm_add_epi32(E0h, E1h);
5385             O1h = _mm_add_epi32(O1h, E2h);
5386             O1h = _mm_add_epi32(O1h, E3h);
5387             O1h = _mm_add_epi32(O1h, E4h);
5388             O1h = _mm_add_epi32(O1h, E5h);
5389             O1h = _mm_add_epi32(O1h, E6h);
5390             O1h = _mm_add_epi32(O1h, E7h);
5391             /* Compute O2*/
5392             E0l = _mm_madd_epi16(m128Tmp0,
5393                     _mm_load_si128((__m128i *) (transform32x32[0][2])));
5394             E0h = _mm_madd_epi16(m128Tmp1,
5395                     _mm_load_si128((__m128i *) (transform32x32[0][2])));
5396             E1l = _mm_madd_epi16(m128Tmp2,
5397                     _mm_load_si128((__m128i *) (transform32x32[1][2])));
5398             E1h = _mm_madd_epi16(m128Tmp3,
5399                     _mm_load_si128((__m128i *) (transform32x32[1][2])));
5400             E2l = _mm_madd_epi16(m128Tmp4,
5401                     _mm_load_si128((__m128i *) (transform32x32[2][2])));
5402             E2h = _mm_madd_epi16(m128Tmp5,
5403                     _mm_load_si128((__m128i *) (transform32x32[2][2])));
5404             E3l = _mm_madd_epi16(m128Tmp6,
5405                     _mm_load_si128((__m128i *) (transform32x32[3][2])));
5406             E3h = _mm_madd_epi16(m128Tmp7,
5407                     _mm_load_si128((__m128i *) (transform32x32[3][2])));
5408 
5409             E4l = _mm_madd_epi16(m128Tmp8,
5410                     _mm_load_si128((__m128i *) (transform32x32[4][2])));
5411             E4h = _mm_madd_epi16(m128Tmp9,
5412                     _mm_load_si128((__m128i *) (transform32x32[4][2])));
5413             E5l = _mm_madd_epi16(m128Tmp10,
5414                     _mm_load_si128((__m128i *) (transform32x32[5][2])));
5415             E5h = _mm_madd_epi16(m128Tmp11,
5416                     _mm_load_si128((__m128i *) (transform32x32[5][2])));
5417             E6l = _mm_madd_epi16(m128Tmp12,
5418                     _mm_load_si128((__m128i *) (transform32x32[6][2])));
5419             E6h = _mm_madd_epi16(m128Tmp13,
5420                     _mm_load_si128((__m128i *) (transform32x32[6][2])));
5421             E7l = _mm_madd_epi16(m128Tmp14,
5422                     _mm_load_si128((__m128i *) (transform32x32[7][2])));
5423             E7h = _mm_madd_epi16(m128Tmp15,
5424                     _mm_load_si128((__m128i *) (transform32x32[7][2])));
5425 
5426             O2l = _mm_add_epi32(E0l, E1l);
5427             O2l = _mm_add_epi32(O2l, E2l);
5428             O2l = _mm_add_epi32(O2l, E3l);
5429             O2l = _mm_add_epi32(O2l, E4l);
5430             O2l = _mm_add_epi32(O2l, E5l);
5431             O2l = _mm_add_epi32(O2l, E6l);
5432             O2l = _mm_add_epi32(O2l, E7l);
5433 
5434             O2h = _mm_add_epi32(E0h, E1h);
5435             O2h = _mm_add_epi32(O2h, E2h);
5436             O2h = _mm_add_epi32(O2h, E3h);
5437             O2h = _mm_add_epi32(O2h, E4h);
5438             O2h = _mm_add_epi32(O2h, E5h);
5439             O2h = _mm_add_epi32(O2h, E6h);
5440             O2h = _mm_add_epi32(O2h, E7h);
5441             /* Compute O3*/
5442             E0l = _mm_madd_epi16(m128Tmp0,
5443                     _mm_load_si128((__m128i *) (transform32x32[0][3])));
5444             E0h = _mm_madd_epi16(m128Tmp1,
5445                     _mm_load_si128((__m128i *) (transform32x32[0][3])));
5446             E1l = _mm_madd_epi16(m128Tmp2,
5447                     _mm_load_si128((__m128i *) (transform32x32[1][3])));
5448             E1h = _mm_madd_epi16(m128Tmp3,
5449                     _mm_load_si128((__m128i *) (transform32x32[1][3])));
5450             E2l = _mm_madd_epi16(m128Tmp4,
5451                     _mm_load_si128((__m128i *) (transform32x32[2][3])));
5452             E2h = _mm_madd_epi16(m128Tmp5,
5453                     _mm_load_si128((__m128i *) (transform32x32[2][3])));
5454             E3l = _mm_madd_epi16(m128Tmp6,
5455                     _mm_load_si128((__m128i *) (transform32x32[3][3])));
5456             E3h = _mm_madd_epi16(m128Tmp7,
5457                     _mm_load_si128((__m128i *) (transform32x32[3][3])));
5458 
5459             E4l = _mm_madd_epi16(m128Tmp8,
5460                     _mm_load_si128((__m128i *) (transform32x32[4][3])));
5461             E4h = _mm_madd_epi16(m128Tmp9,
5462                     _mm_load_si128((__m128i *) (transform32x32[4][3])));
5463             E5l = _mm_madd_epi16(m128Tmp10,
5464                     _mm_load_si128((__m128i *) (transform32x32[5][3])));
5465             E5h = _mm_madd_epi16(m128Tmp11,
5466                     _mm_load_si128((__m128i *) (transform32x32[5][3])));
5467             E6l = _mm_madd_epi16(m128Tmp12,
5468                     _mm_load_si128((__m128i *) (transform32x32[6][3])));
5469             E6h = _mm_madd_epi16(m128Tmp13,
5470                     _mm_load_si128((__m128i *) (transform32x32[6][3])));
5471             E7l = _mm_madd_epi16(m128Tmp14,
5472                     _mm_load_si128((__m128i *) (transform32x32[7][3])));
5473             E7h = _mm_madd_epi16(m128Tmp15,
5474                     _mm_load_si128((__m128i *) (transform32x32[7][3])));
5475 
5476             O3l = _mm_add_epi32(E0l, E1l);
5477             O3l = _mm_add_epi32(O3l, E2l);
5478             O3l = _mm_add_epi32(O3l, E3l);
5479             O3l = _mm_add_epi32(O3l, E4l);
5480             O3l = _mm_add_epi32(O3l, E5l);
5481             O3l = _mm_add_epi32(O3l, E6l);
5482             O3l = _mm_add_epi32(O3l, E7l);
5483 
5484             O3h = _mm_add_epi32(E0h, E1h);
5485             O3h = _mm_add_epi32(O3h, E2h);
5486             O3h = _mm_add_epi32(O3h, E3h);
5487             O3h = _mm_add_epi32(O3h, E4h);
5488             O3h = _mm_add_epi32(O3h, E5h);
5489             O3h = _mm_add_epi32(O3h, E6h);
5490             O3h = _mm_add_epi32(O3h, E7h);
5491             /* Compute O4*/
5492 
5493             E0l = _mm_madd_epi16(m128Tmp0,
5494                     _mm_load_si128((__m128i *) (transform32x32[0][4])));
5495             E0h = _mm_madd_epi16(m128Tmp1,
5496                     _mm_load_si128((__m128i *) (transform32x32[0][4])));
5497             E1l = _mm_madd_epi16(m128Tmp2,
5498                     _mm_load_si128((__m128i *) (transform32x32[1][4])));
5499             E1h = _mm_madd_epi16(m128Tmp3,
5500                     _mm_load_si128((__m128i *) (transform32x32[1][4])));
5501             E2l = _mm_madd_epi16(m128Tmp4,
5502                     _mm_load_si128((__m128i *) (transform32x32[2][4])));
5503             E2h = _mm_madd_epi16(m128Tmp5,
5504                     _mm_load_si128((__m128i *) (transform32x32[2][4])));
5505             E3l = _mm_madd_epi16(m128Tmp6,
5506                     _mm_load_si128((__m128i *) (transform32x32[3][4])));
5507             E3h = _mm_madd_epi16(m128Tmp7,
5508                     _mm_load_si128((__m128i *) (transform32x32[3][4])));
5509 
5510             E4l = _mm_madd_epi16(m128Tmp8,
5511                     _mm_load_si128((__m128i *) (transform32x32[4][4])));
5512             E4h = _mm_madd_epi16(m128Tmp9,
5513                     _mm_load_si128((__m128i *) (transform32x32[4][4])));
5514             E5l = _mm_madd_epi16(m128Tmp10,
5515                     _mm_load_si128((__m128i *) (transform32x32[5][4])));
5516             E5h = _mm_madd_epi16(m128Tmp11,
5517                     _mm_load_si128((__m128i *) (transform32x32[5][4])));
5518             E6l = _mm_madd_epi16(m128Tmp12,
5519                     _mm_load_si128((__m128i *) (transform32x32[6][4])));
5520             E6h = _mm_madd_epi16(m128Tmp13,
5521                     _mm_load_si128((__m128i *) (transform32x32[6][4])));
5522             E7l = _mm_madd_epi16(m128Tmp14,
5523                     _mm_load_si128((__m128i *) (transform32x32[7][4])));
5524             E7h = _mm_madd_epi16(m128Tmp15,
5525                     _mm_load_si128((__m128i *) (transform32x32[7][4])));
5526 
5527             O4l = _mm_add_epi32(E0l, E1l);
5528             O4l = _mm_add_epi32(O4l, E2l);
5529             O4l = _mm_add_epi32(O4l, E3l);
5530             O4l = _mm_add_epi32(O4l, E4l);
5531             O4l = _mm_add_epi32(O4l, E5l);
5532             O4l = _mm_add_epi32(O4l, E6l);
5533             O4l = _mm_add_epi32(O4l, E7l);
5534 
5535             O4h = _mm_add_epi32(E0h, E1h);
5536             O4h = _mm_add_epi32(O4h, E2h);
5537             O4h = _mm_add_epi32(O4h, E3h);
5538             O4h = _mm_add_epi32(O4h, E4h);
5539             O4h = _mm_add_epi32(O4h, E5h);
5540             O4h = _mm_add_epi32(O4h, E6h);
5541             O4h = _mm_add_epi32(O4h, E7h);
5542 
5543             /* Compute O5*/
5544             E0l = _mm_madd_epi16(m128Tmp0,
5545                     _mm_load_si128((__m128i *) (transform32x32[0][5])));
5546             E0h = _mm_madd_epi16(m128Tmp1,
5547                     _mm_load_si128((__m128i *) (transform32x32[0][5])));
5548             E1l = _mm_madd_epi16(m128Tmp2,
5549                     _mm_load_si128((__m128i *) (transform32x32[1][5])));
5550             E1h = _mm_madd_epi16(m128Tmp3,
5551                     _mm_load_si128((__m128i *) (transform32x32[1][5])));
5552             E2l = _mm_madd_epi16(m128Tmp4,
5553                     _mm_load_si128((__m128i *) (transform32x32[2][5])));
5554             E2h = _mm_madd_epi16(m128Tmp5,
5555                     _mm_load_si128((__m128i *) (transform32x32[2][5])));
5556             E3l = _mm_madd_epi16(m128Tmp6,
5557                     _mm_load_si128((__m128i *) (transform32x32[3][5])));
5558             E3h = _mm_madd_epi16(m128Tmp7,
5559                     _mm_load_si128((__m128i *) (transform32x32[3][5])));
5560 
5561             E4l = _mm_madd_epi16(m128Tmp8,
5562                     _mm_load_si128((__m128i *) (transform32x32[4][5])));
5563             E4h = _mm_madd_epi16(m128Tmp9,
5564                     _mm_load_si128((__m128i *) (transform32x32[4][5])));
5565             E5l = _mm_madd_epi16(m128Tmp10,
5566                     _mm_load_si128((__m128i *) (transform32x32[5][5])));
5567             E5h = _mm_madd_epi16(m128Tmp11,
5568                     _mm_load_si128((__m128i *) (transform32x32[5][5])));
5569             E6l = _mm_madd_epi16(m128Tmp12,
5570                     _mm_load_si128((__m128i *) (transform32x32[6][5])));
5571             E6h = _mm_madd_epi16(m128Tmp13,
5572                     _mm_load_si128((__m128i *) (transform32x32[6][5])));
5573             E7l = _mm_madd_epi16(m128Tmp14,
5574                     _mm_load_si128((__m128i *) (transform32x32[7][5])));
5575             E7h = _mm_madd_epi16(m128Tmp15,
5576                     _mm_load_si128((__m128i *) (transform32x32[7][5])));
5577 
5578             O5l = _mm_add_epi32(E0l, E1l);
5579             O5l = _mm_add_epi32(O5l, E2l);
5580             O5l = _mm_add_epi32(O5l, E3l);
5581             O5l = _mm_add_epi32(O5l, E4l);
5582             O5l = _mm_add_epi32(O5l, E5l);
5583             O5l = _mm_add_epi32(O5l, E6l);
5584             O5l = _mm_add_epi32(O5l, E7l);
5585 
5586             O5h = _mm_add_epi32(E0h, E1h);
5587             O5h = _mm_add_epi32(O5h, E2h);
5588             O5h = _mm_add_epi32(O5h, E3h);
5589             O5h = _mm_add_epi32(O5h, E4h);
5590             O5h = _mm_add_epi32(O5h, E5h);
5591             O5h = _mm_add_epi32(O5h, E6h);
5592             O5h = _mm_add_epi32(O5h, E7h);
5593 
5594             /* Compute O6*/
5595 
5596             E0l = _mm_madd_epi16(m128Tmp0,
5597                     _mm_load_si128((__m128i *) (transform32x32[0][6])));
5598             E0h = _mm_madd_epi16(m128Tmp1,
5599                     _mm_load_si128((__m128i *) (transform32x32[0][6])));
5600             E1l = _mm_madd_epi16(m128Tmp2,
5601                     _mm_load_si128((__m128i *) (transform32x32[1][6])));
5602             E1h = _mm_madd_epi16(m128Tmp3,
5603                     _mm_load_si128((__m128i *) (transform32x32[1][6])));
5604             E2l = _mm_madd_epi16(m128Tmp4,
5605                     _mm_load_si128((__m128i *) (transform32x32[2][6])));
5606             E2h = _mm_madd_epi16(m128Tmp5,
5607                     _mm_load_si128((__m128i *) (transform32x32[2][6])));
5608             E3l = _mm_madd_epi16(m128Tmp6,
5609                     _mm_load_si128((__m128i *) (transform32x32[3][6])));
5610             E3h = _mm_madd_epi16(m128Tmp7,
5611                     _mm_load_si128((__m128i *) (transform32x32[3][6])));
5612 
5613             E4l = _mm_madd_epi16(m128Tmp8,
5614                     _mm_load_si128((__m128i *) (transform32x32[4][6])));
5615             E4h = _mm_madd_epi16(m128Tmp9,
5616                     _mm_load_si128((__m128i *) (transform32x32[4][6])));
5617             E5l = _mm_madd_epi16(m128Tmp10,
5618                     _mm_load_si128((__m128i *) (transform32x32[5][6])));
5619             E5h = _mm_madd_epi16(m128Tmp11,
5620                     _mm_load_si128((__m128i *) (transform32x32[5][6])));
5621             E6l = _mm_madd_epi16(m128Tmp12,
5622                     _mm_load_si128((__m128i *) (transform32x32[6][6])));
5623             E6h = _mm_madd_epi16(m128Tmp13,
5624                     _mm_load_si128((__m128i *) (transform32x32[6][6])));
5625             E7l = _mm_madd_epi16(m128Tmp14,
5626                     _mm_load_si128((__m128i *) (transform32x32[7][6])));
5627             E7h = _mm_madd_epi16(m128Tmp15,
5628                     _mm_load_si128((__m128i *) (transform32x32[7][6])));
5629 
5630             O6l = _mm_add_epi32(E0l, E1l);
5631             O6l = _mm_add_epi32(O6l, E2l);
5632             O6l = _mm_add_epi32(O6l, E3l);
5633             O6l = _mm_add_epi32(O6l, E4l);
5634             O6l = _mm_add_epi32(O6l, E5l);
5635             O6l = _mm_add_epi32(O6l, E6l);
5636             O6l = _mm_add_epi32(O6l, E7l);
5637 
5638             O6h = _mm_add_epi32(E0h, E1h);
5639             O6h = _mm_add_epi32(O6h, E2h);
5640             O6h = _mm_add_epi32(O6h, E3h);
5641             O6h = _mm_add_epi32(O6h, E4h);
5642             O6h = _mm_add_epi32(O6h, E5h);
5643             O6h = _mm_add_epi32(O6h, E6h);
5644             O6h = _mm_add_epi32(O6h, E7h);
5645 
5646             /* Compute O7*/
5647 
5648             E0l = _mm_madd_epi16(m128Tmp0,
5649                     _mm_load_si128((__m128i *) (transform32x32[0][7])));
5650             E0h = _mm_madd_epi16(m128Tmp1,
5651                     _mm_load_si128((__m128i *) (transform32x32[0][7])));
5652             E1l = _mm_madd_epi16(m128Tmp2,
5653                     _mm_load_si128((__m128i *) (transform32x32[1][7])));
5654             E1h = _mm_madd_epi16(m128Tmp3,
5655                     _mm_load_si128((__m128i *) (transform32x32[1][7])));
5656             E2l = _mm_madd_epi16(m128Tmp4,
5657                     _mm_load_si128((__m128i *) (transform32x32[2][7])));
5658             E2h = _mm_madd_epi16(m128Tmp5,
5659                     _mm_load_si128((__m128i *) (transform32x32[2][7])));
5660             E3l = _mm_madd_epi16(m128Tmp6,
5661                     _mm_load_si128((__m128i *) (transform32x32[3][7])));
5662             E3h = _mm_madd_epi16(m128Tmp7,
5663                     _mm_load_si128((__m128i *) (transform32x32[3][7])));
5664 
5665             E4l = _mm_madd_epi16(m128Tmp8,
5666                     _mm_load_si128((__m128i *) (transform32x32[4][7])));
5667             E4h = _mm_madd_epi16(m128Tmp9,
5668                     _mm_load_si128((__m128i *) (transform32x32[4][7])));
5669             E5l = _mm_madd_epi16(m128Tmp10,
5670                     _mm_load_si128((__m128i *) (transform32x32[5][7])));
5671             E5h = _mm_madd_epi16(m128Tmp11,
5672                     _mm_load_si128((__m128i *) (transform32x32[5][7])));
5673             E6l = _mm_madd_epi16(m128Tmp12,
5674                     _mm_load_si128((__m128i *) (transform32x32[6][7])));
5675             E6h = _mm_madd_epi16(m128Tmp13,
5676                     _mm_load_si128((__m128i *) (transform32x32[6][7])));
5677             E7l = _mm_madd_epi16(m128Tmp14,
5678                     _mm_load_si128((__m128i *) (transform32x32[7][7])));
5679             E7h = _mm_madd_epi16(m128Tmp15,
5680                     _mm_load_si128((__m128i *) (transform32x32[7][7])));
5681 
5682             O7l = _mm_add_epi32(E0l, E1l);
5683             O7l = _mm_add_epi32(O7l, E2l);
5684             O7l = _mm_add_epi32(O7l, E3l);
5685             O7l = _mm_add_epi32(O7l, E4l);
5686             O7l = _mm_add_epi32(O7l, E5l);
5687             O7l = _mm_add_epi32(O7l, E6l);
5688             O7l = _mm_add_epi32(O7l, E7l);
5689 
5690             O7h = _mm_add_epi32(E0h, E1h);
5691             O7h = _mm_add_epi32(O7h, E2h);
5692             O7h = _mm_add_epi32(O7h, E3h);
5693             O7h = _mm_add_epi32(O7h, E4h);
5694             O7h = _mm_add_epi32(O7h, E5h);
5695             O7h = _mm_add_epi32(O7h, E6h);
5696             O7h = _mm_add_epi32(O7h, E7h);
5697 
5698             /* Compute O8*/
5699 
5700             E0l = _mm_madd_epi16(m128Tmp0,
5701                     _mm_load_si128((__m128i *) (transform32x32[0][8])));
5702             E0h = _mm_madd_epi16(m128Tmp1,
5703                     _mm_load_si128((__m128i *) (transform32x32[0][8])));
5704             E1l = _mm_madd_epi16(m128Tmp2,
5705                     _mm_load_si128((__m128i *) (transform32x32[1][8])));
5706             E1h = _mm_madd_epi16(m128Tmp3,
5707                     _mm_load_si128((__m128i *) (transform32x32[1][8])));
5708             E2l = _mm_madd_epi16(m128Tmp4,
5709                     _mm_load_si128((__m128i *) (transform32x32[2][8])));
5710             E2h = _mm_madd_epi16(m128Tmp5,
5711                     _mm_load_si128((__m128i *) (transform32x32[2][8])));
5712             E3l = _mm_madd_epi16(m128Tmp6,
5713                     _mm_load_si128((__m128i *) (transform32x32[3][8])));
5714             E3h = _mm_madd_epi16(m128Tmp7,
5715                     _mm_load_si128((__m128i *) (transform32x32[3][8])));
5716 
5717             E4l = _mm_madd_epi16(m128Tmp8,
5718                     _mm_load_si128((__m128i *) (transform32x32[4][8])));
5719             E4h = _mm_madd_epi16(m128Tmp9,
5720                     _mm_load_si128((__m128i *) (transform32x32[4][8])));
5721             E5l = _mm_madd_epi16(m128Tmp10,
5722                     _mm_load_si128((__m128i *) (transform32x32[5][8])));
5723             E5h = _mm_madd_epi16(m128Tmp11,
5724                     _mm_load_si128((__m128i *) (transform32x32[5][8])));
5725             E6l = _mm_madd_epi16(m128Tmp12,
5726                     _mm_load_si128((__m128i *) (transform32x32[6][8])));
5727             E6h = _mm_madd_epi16(m128Tmp13,
5728                     _mm_load_si128((__m128i *) (transform32x32[6][8])));
5729             E7l = _mm_madd_epi16(m128Tmp14,
5730                     _mm_load_si128((__m128i *) (transform32x32[7][8])));
5731             E7h = _mm_madd_epi16(m128Tmp15,
5732                     _mm_load_si128((__m128i *) (transform32x32[7][8])));
5733 
5734             O8l = _mm_add_epi32(E0l, E1l);
5735             O8l = _mm_add_epi32(O8l, E2l);
5736             O8l = _mm_add_epi32(O8l, E3l);
5737             O8l = _mm_add_epi32(O8l, E4l);
5738             O8l = _mm_add_epi32(O8l, E5l);
5739             O8l = _mm_add_epi32(O8l, E6l);
5740             O8l = _mm_add_epi32(O8l, E7l);
5741 
5742             O8h = _mm_add_epi32(E0h, E1h);
5743             O8h = _mm_add_epi32(O8h, E2h);
5744             O8h = _mm_add_epi32(O8h, E3h);
5745             O8h = _mm_add_epi32(O8h, E4h);
5746             O8h = _mm_add_epi32(O8h, E5h);
5747             O8h = _mm_add_epi32(O8h, E6h);
5748             O8h = _mm_add_epi32(O8h, E7h);
5749 
5750             /* Compute O9*/
5751 
5752             E0l = _mm_madd_epi16(m128Tmp0,
5753                     _mm_load_si128((__m128i *) (transform32x32[0][9])));
5754             E0h = _mm_madd_epi16(m128Tmp1,
5755                     _mm_load_si128((__m128i *) (transform32x32[0][9])));
5756             E1l = _mm_madd_epi16(m128Tmp2,
5757                     _mm_load_si128((__m128i *) (transform32x32[1][9])));
5758             E1h = _mm_madd_epi16(m128Tmp3,
5759                     _mm_load_si128((__m128i *) (transform32x32[1][9])));
5760             E2l = _mm_madd_epi16(m128Tmp4,
5761                     _mm_load_si128((__m128i *) (transform32x32[2][9])));
5762             E2h = _mm_madd_epi16(m128Tmp5,
5763                     _mm_load_si128((__m128i *) (transform32x32[2][9])));
5764             E3l = _mm_madd_epi16(m128Tmp6,
5765                     _mm_load_si128((__m128i *) (transform32x32[3][9])));
5766             E3h = _mm_madd_epi16(m128Tmp7,
5767                     _mm_load_si128((__m128i *) (transform32x32[3][9])));
5768 
5769             E4l = _mm_madd_epi16(m128Tmp8,
5770                     _mm_load_si128((__m128i *) (transform32x32[4][9])));
5771             E4h = _mm_madd_epi16(m128Tmp9,
5772                     _mm_load_si128((__m128i *) (transform32x32[4][9])));
5773             E5l = _mm_madd_epi16(m128Tmp10,
5774                     _mm_load_si128((__m128i *) (transform32x32[5][9])));
5775             E5h = _mm_madd_epi16(m128Tmp11,
5776                     _mm_load_si128((__m128i *) (transform32x32[5][9])));
5777             E6l = _mm_madd_epi16(m128Tmp12,
5778                     _mm_load_si128((__m128i *) (transform32x32[6][9])));
5779             E6h = _mm_madd_epi16(m128Tmp13,
5780                     _mm_load_si128((__m128i *) (transform32x32[6][9])));
5781             E7l = _mm_madd_epi16(m128Tmp14,
5782                     _mm_load_si128((__m128i *) (transform32x32[7][9])));
5783             E7h = _mm_madd_epi16(m128Tmp15,
5784                     _mm_load_si128((__m128i *) (transform32x32[7][9])));
5785 
5786             O9l = _mm_add_epi32(E0l, E1l);
5787             O9l = _mm_add_epi32(O9l, E2l);
5788             O9l = _mm_add_epi32(O9l, E3l);
5789             O9l = _mm_add_epi32(O9l, E4l);
5790             O9l = _mm_add_epi32(O9l, E5l);
5791             O9l = _mm_add_epi32(O9l, E6l);
5792             O9l = _mm_add_epi32(O9l, E7l);
5793 
5794             O9h = _mm_add_epi32(E0h, E1h);
5795             O9h = _mm_add_epi32(O9h, E2h);
5796             O9h = _mm_add_epi32(O9h, E3h);
5797             O9h = _mm_add_epi32(O9h, E4h);
5798             O9h = _mm_add_epi32(O9h, E5h);
5799             O9h = _mm_add_epi32(O9h, E6h);
5800             O9h = _mm_add_epi32(O9h, E7h);
5801 
5802             /* Compute O10 */
5803 
5804             E0l = _mm_madd_epi16(m128Tmp0,
5805                     _mm_load_si128((__m128i *) (transform32x32[0][10])));
5806             E0h = _mm_madd_epi16(m128Tmp1,
5807                     _mm_load_si128((__m128i *) (transform32x32[0][10])));
5808             E1l = _mm_madd_epi16(m128Tmp2,
5809                     _mm_load_si128((__m128i *) (transform32x32[1][10])));
5810             E1h = _mm_madd_epi16(m128Tmp3,
5811                     _mm_load_si128((__m128i *) (transform32x32[1][10])));
5812             E2l = _mm_madd_epi16(m128Tmp4,
5813                     _mm_load_si128((__m128i *) (transform32x32[2][10])));
5814             E2h = _mm_madd_epi16(m128Tmp5,
5815                     _mm_load_si128((__m128i *) (transform32x32[2][10])));
5816             E3l = _mm_madd_epi16(m128Tmp6,
5817                     _mm_load_si128((__m128i *) (transform32x32[3][10])));
5818             E3h = _mm_madd_epi16(m128Tmp7,
5819                     _mm_load_si128((__m128i *) (transform32x32[3][10])));
5820 
5821             E4l = _mm_madd_epi16(m128Tmp8,
5822                     _mm_load_si128((__m128i *) (transform32x32[4][10])));
5823             E4h = _mm_madd_epi16(m128Tmp9,
5824                     _mm_load_si128((__m128i *) (transform32x32[4][10])));
5825             E5l = _mm_madd_epi16(m128Tmp10,
5826                     _mm_load_si128((__m128i *) (transform32x32[5][10])));
5827             E5h = _mm_madd_epi16(m128Tmp11,
5828                     _mm_load_si128((__m128i *) (transform32x32[5][10])));
5829             E6l = _mm_madd_epi16(m128Tmp12,
5830                     _mm_load_si128((__m128i *) (transform32x32[6][10])));
5831             E6h = _mm_madd_epi16(m128Tmp13,
5832                     _mm_load_si128((__m128i *) (transform32x32[6][10])));
5833             E7l = _mm_madd_epi16(m128Tmp14,
5834                     _mm_load_si128((__m128i *) (transform32x32[7][10])));
5835             E7h = _mm_madd_epi16(m128Tmp15,
5836                     _mm_load_si128((__m128i *) (transform32x32[7][10])));
5837 
5838             O10l = _mm_add_epi32(E0l, E1l);
5839             O10l = _mm_add_epi32(O10l, E2l);
5840             O10l = _mm_add_epi32(O10l, E3l);
5841             O10l = _mm_add_epi32(O10l, E4l);
5842             O10l = _mm_add_epi32(O10l, E5l);
5843             O10l = _mm_add_epi32(O10l, E6l);
5844             O10l = _mm_add_epi32(O10l, E7l);
5845 
5846             O10h = _mm_add_epi32(E0h, E1h);
5847             O10h = _mm_add_epi32(O10h, E2h);
5848             O10h = _mm_add_epi32(O10h, E3h);
5849             O10h = _mm_add_epi32(O10h, E4h);
5850             O10h = _mm_add_epi32(O10h, E5h);
5851             O10h = _mm_add_epi32(O10h, E6h);
5852             O10h = _mm_add_epi32(O10h, E7h);
5853 
5854             /* Compute O11 */
5855 
5856             E0l = _mm_madd_epi16(m128Tmp0,
5857                     _mm_load_si128((__m128i *) (transform32x32[0][11])));
5858             E0h = _mm_madd_epi16(m128Tmp1,
5859                     _mm_load_si128((__m128i *) (transform32x32[0][11])));
5860             E1l = _mm_madd_epi16(m128Tmp2,
5861                     _mm_load_si128((__m128i *) (transform32x32[1][11])));
5862             E1h = _mm_madd_epi16(m128Tmp3,
5863                     _mm_load_si128((__m128i *) (transform32x32[1][11])));
5864             E2l = _mm_madd_epi16(m128Tmp4,
5865                     _mm_load_si128((__m128i *) (transform32x32[2][11])));
5866             E2h = _mm_madd_epi16(m128Tmp5,
5867                     _mm_load_si128((__m128i *) (transform32x32[2][11])));
5868             E3l = _mm_madd_epi16(m128Tmp6,
5869                     _mm_load_si128((__m128i *) (transform32x32[3][11])));
5870             E3h = _mm_madd_epi16(m128Tmp7,
5871                     _mm_load_si128((__m128i *) (transform32x32[3][11])));
5872 
5873             E4l = _mm_madd_epi16(m128Tmp8,
5874                     _mm_load_si128((__m128i *) (transform32x32[4][11])));
5875             E4h = _mm_madd_epi16(m128Tmp9,
5876                     _mm_load_si128((__m128i *) (transform32x32[4][11])));
5877             E5l = _mm_madd_epi16(m128Tmp10,
5878                     _mm_load_si128((__m128i *) (transform32x32[5][11])));
5879             E5h = _mm_madd_epi16(m128Tmp11,
5880                     _mm_load_si128((__m128i *) (transform32x32[5][11])));
5881             E6l = _mm_madd_epi16(m128Tmp12,
5882                     _mm_load_si128((__m128i *) (transform32x32[6][11])));
5883             E6h = _mm_madd_epi16(m128Tmp13,
5884                     _mm_load_si128((__m128i *) (transform32x32[6][11])));
5885             E7l = _mm_madd_epi16(m128Tmp14,
5886                     _mm_load_si128((__m128i *) (transform32x32[7][11])));
5887             E7h = _mm_madd_epi16(m128Tmp15,
5888                     _mm_load_si128((__m128i *) (transform32x32[7][11])));
5889 
5890             O11l = _mm_add_epi32(E0l, E1l);
5891             O11l = _mm_add_epi32(O11l, E2l);
5892             O11l = _mm_add_epi32(O11l, E3l);
5893             O11l = _mm_add_epi32(O11l, E4l);
5894             O11l = _mm_add_epi32(O11l, E5l);
5895             O11l = _mm_add_epi32(O11l, E6l);
5896             O11l = _mm_add_epi32(O11l, E7l);
5897 
5898             O11h = _mm_add_epi32(E0h, E1h);
5899             O11h = _mm_add_epi32(O11h, E2h);
5900             O11h = _mm_add_epi32(O11h, E3h);
5901             O11h = _mm_add_epi32(O11h, E4h);
5902             O11h = _mm_add_epi32(O11h, E5h);
5903             O11h = _mm_add_epi32(O11h, E6h);
5904             O11h = _mm_add_epi32(O11h, E7h);
5905 
5906             /* Compute O12 */
5907 
5908             E0l = _mm_madd_epi16(m128Tmp0,
5909                     _mm_load_si128((__m128i *) (transform32x32[0][12])));
5910             E0h = _mm_madd_epi16(m128Tmp1,
5911                     _mm_load_si128((__m128i *) (transform32x32[0][12])));
5912             E1l = _mm_madd_epi16(m128Tmp2,
5913                     _mm_load_si128((__m128i *) (transform32x32[1][12])));
5914             E1h = _mm_madd_epi16(m128Tmp3,
5915                     _mm_load_si128((__m128i *) (transform32x32[1][12])));
5916             E2l = _mm_madd_epi16(m128Tmp4,
5917                     _mm_load_si128((__m128i *) (transform32x32[2][12])));
5918             E2h = _mm_madd_epi16(m128Tmp5,
5919                     _mm_load_si128((__m128i *) (transform32x32[2][12])));
5920             E3l = _mm_madd_epi16(m128Tmp6,
5921                     _mm_load_si128((__m128i *) (transform32x32[3][12])));
5922             E3h = _mm_madd_epi16(m128Tmp7,
5923                     _mm_load_si128((__m128i *) (transform32x32[3][12])));
5924 
5925             E4l = _mm_madd_epi16(m128Tmp8,
5926                     _mm_load_si128((__m128i *) (transform32x32[4][12])));
5927             E4h = _mm_madd_epi16(m128Tmp9,
5928                     _mm_load_si128((__m128i *) (transform32x32[4][12])));
5929             E5l = _mm_madd_epi16(m128Tmp10,
5930                     _mm_load_si128((__m128i *) (transform32x32[5][12])));
5931             E5h = _mm_madd_epi16(m128Tmp11,
5932                     _mm_load_si128((__m128i *) (transform32x32[5][12])));
5933             E6l = _mm_madd_epi16(m128Tmp12,
5934                     _mm_load_si128((__m128i *) (transform32x32[6][12])));
5935             E6h = _mm_madd_epi16(m128Tmp13,
5936                     _mm_load_si128((__m128i *) (transform32x32[6][12])));
5937             E7l = _mm_madd_epi16(m128Tmp14,
5938                     _mm_load_si128((__m128i *) (transform32x32[7][12])));
5939             E7h = _mm_madd_epi16(m128Tmp15,
5940                     _mm_load_si128((__m128i *) (transform32x32[7][12])));
5941 
5942             O12l = _mm_add_epi32(E0l, E1l);
5943             O12l = _mm_add_epi32(O12l, E2l);
5944             O12l = _mm_add_epi32(O12l, E3l);
5945             O12l = _mm_add_epi32(O12l, E4l);
5946             O12l = _mm_add_epi32(O12l, E5l);
5947             O12l = _mm_add_epi32(O12l, E6l);
5948             O12l = _mm_add_epi32(O12l, E7l);
5949 
5950             O12h = _mm_add_epi32(E0h, E1h);
5951             O12h = _mm_add_epi32(O12h, E2h);
5952             O12h = _mm_add_epi32(O12h, E3h);
5953             O12h = _mm_add_epi32(O12h, E4h);
5954             O12h = _mm_add_epi32(O12h, E5h);
5955             O12h = _mm_add_epi32(O12h, E6h);
5956             O12h = _mm_add_epi32(O12h, E7h);
5957 
5958             /* Compute O13 */
5959 
5960             E0l = _mm_madd_epi16(m128Tmp0,
5961                     _mm_load_si128((__m128i *) (transform32x32[0][13])));
5962             E0h = _mm_madd_epi16(m128Tmp1,
5963                     _mm_load_si128((__m128i *) (transform32x32[0][13])));
5964             E1l = _mm_madd_epi16(m128Tmp2,
5965                     _mm_load_si128((__m128i *) (transform32x32[1][13])));
5966             E1h = _mm_madd_epi16(m128Tmp3,
5967                     _mm_load_si128((__m128i *) (transform32x32[1][13])));
5968             E2l = _mm_madd_epi16(m128Tmp4,
5969                     _mm_load_si128((__m128i *) (transform32x32[2][13])));
5970             E2h = _mm_madd_epi16(m128Tmp5,
5971                     _mm_load_si128((__m128i *) (transform32x32[2][13])));
5972             E3l = _mm_madd_epi16(m128Tmp6,
5973                     _mm_load_si128((__m128i *) (transform32x32[3][13])));
5974             E3h = _mm_madd_epi16(m128Tmp7,
5975                     _mm_load_si128((__m128i *) (transform32x32[3][13])));
5976 
5977             E4l = _mm_madd_epi16(m128Tmp8,
5978                     _mm_load_si128((__m128i *) (transform32x32[4][13])));
5979             E4h = _mm_madd_epi16(m128Tmp9,
5980                     _mm_load_si128((__m128i *) (transform32x32[4][13])));
5981             E5l = _mm_madd_epi16(m128Tmp10,
5982                     _mm_load_si128((__m128i *) (transform32x32[5][13])));
5983             E5h = _mm_madd_epi16(m128Tmp11,
5984                     _mm_load_si128((__m128i *) (transform32x32[5][13])));
5985             E6l = _mm_madd_epi16(m128Tmp12,
5986                     _mm_load_si128((__m128i *) (transform32x32[6][13])));
5987             E6h = _mm_madd_epi16(m128Tmp13,
5988                     _mm_load_si128((__m128i *) (transform32x32[6][13])));
5989             E7l = _mm_madd_epi16(m128Tmp14,
5990                     _mm_load_si128((__m128i *) (transform32x32[7][13])));
5991             E7h = _mm_madd_epi16(m128Tmp15,
5992                     _mm_load_si128((__m128i *) (transform32x32[7][13])));
5993 
5994             O13l = _mm_add_epi32(E0l, E1l);
5995             O13l = _mm_add_epi32(O13l, E2l);
5996             O13l = _mm_add_epi32(O13l, E3l);
5997             O13l = _mm_add_epi32(O13l, E4l);
5998             O13l = _mm_add_epi32(O13l, E5l);
5999             O13l = _mm_add_epi32(O13l, E6l);
6000             O13l = _mm_add_epi32(O13l, E7l);
6001 
6002             O13h = _mm_add_epi32(E0h, E1h);
6003             O13h = _mm_add_epi32(O13h, E2h);
6004             O13h = _mm_add_epi32(O13h, E3h);
6005             O13h = _mm_add_epi32(O13h, E4h);
6006             O13h = _mm_add_epi32(O13h, E5h);
6007             O13h = _mm_add_epi32(O13h, E6h);
6008             O13h = _mm_add_epi32(O13h, E7h);
6009 
6010             /* Compute O14  */
6011 
6012             E0l = _mm_madd_epi16(m128Tmp0,
6013                     _mm_load_si128((__m128i *) (transform32x32[0][14])));
6014             E0h = _mm_madd_epi16(m128Tmp1,
6015                     _mm_load_si128((__m128i *) (transform32x32[0][14])));
6016             E1l = _mm_madd_epi16(m128Tmp2,
6017                     _mm_load_si128((__m128i *) (transform32x32[1][14])));
6018             E1h = _mm_madd_epi16(m128Tmp3,
6019                     _mm_load_si128((__m128i *) (transform32x32[1][14])));
6020             E2l = _mm_madd_epi16(m128Tmp4,
6021                     _mm_load_si128((__m128i *) (transform32x32[2][14])));
6022             E2h = _mm_madd_epi16(m128Tmp5,
6023                     _mm_load_si128((__m128i *) (transform32x32[2][14])));
6024             E3l = _mm_madd_epi16(m128Tmp6,
6025                     _mm_load_si128((__m128i *) (transform32x32[3][14])));
6026             E3h = _mm_madd_epi16(m128Tmp7,
6027                     _mm_load_si128((__m128i *) (transform32x32[3][14])));
6028 
6029             E4l = _mm_madd_epi16(m128Tmp8,
6030                     _mm_load_si128((__m128i *) (transform32x32[4][14])));
6031             E4h = _mm_madd_epi16(m128Tmp9,
6032                     _mm_load_si128((__m128i *) (transform32x32[4][14])));
6033             E5l = _mm_madd_epi16(m128Tmp10,
6034                     _mm_load_si128((__m128i *) (transform32x32[5][14])));
6035             E5h = _mm_madd_epi16(m128Tmp11,
6036                     _mm_load_si128((__m128i *) (transform32x32[5][14])));
6037             E6l = _mm_madd_epi16(m128Tmp12,
6038                     _mm_load_si128((__m128i *) (transform32x32[6][14])));
6039             E6h = _mm_madd_epi16(m128Tmp13,
6040                     _mm_load_si128((__m128i *) (transform32x32[6][14])));
6041             E7l = _mm_madd_epi16(m128Tmp14,
6042                     _mm_load_si128((__m128i *) (transform32x32[7][14])));
6043             E7h = _mm_madd_epi16(m128Tmp15,
6044                     _mm_load_si128((__m128i *) (transform32x32[7][14])));
6045 
6046             O14l = _mm_add_epi32(E0l, E1l);
6047             O14l = _mm_add_epi32(O14l, E2l);
6048             O14l = _mm_add_epi32(O14l, E3l);
6049             O14l = _mm_add_epi32(O14l, E4l);
6050             O14l = _mm_add_epi32(O14l, E5l);
6051             O14l = _mm_add_epi32(O14l, E6l);
6052             O14l = _mm_add_epi32(O14l, E7l);
6053 
6054             O14h = _mm_add_epi32(E0h, E1h);
6055             O14h = _mm_add_epi32(O14h, E2h);
6056             O14h = _mm_add_epi32(O14h, E3h);
6057             O14h = _mm_add_epi32(O14h, E4h);
6058             O14h = _mm_add_epi32(O14h, E5h);
6059             O14h = _mm_add_epi32(O14h, E6h);
6060             O14h = _mm_add_epi32(O14h, E7h);
6061 
6062             /* Compute O15*/
6063 
6064             E0l = _mm_madd_epi16(m128Tmp0,
6065                     _mm_load_si128((__m128i *) (transform32x32[0][15])));
6066             E0h = _mm_madd_epi16(m128Tmp1,
6067                     _mm_load_si128((__m128i *) (transform32x32[0][15])));
6068             E1l = _mm_madd_epi16(m128Tmp2,
6069                     _mm_load_si128((__m128i *) (transform32x32[1][15])));
6070             E1h = _mm_madd_epi16(m128Tmp3,
6071                     _mm_load_si128((__m128i *) (transform32x32[1][15])));
6072             E2l = _mm_madd_epi16(m128Tmp4,
6073                     _mm_load_si128((__m128i *) (transform32x32[2][15])));
6074             E2h = _mm_madd_epi16(m128Tmp5,
6075                     _mm_load_si128((__m128i *) (transform32x32[2][15])));
6076             E3l = _mm_madd_epi16(m128Tmp6,
6077                     _mm_load_si128((__m128i *) (transform32x32[3][15])));
6078             E3h = _mm_madd_epi16(m128Tmp7,
6079                     _mm_load_si128((__m128i *) (transform32x32[3][15])));
6080 
6081             E4l = _mm_madd_epi16(m128Tmp8,
6082                     _mm_load_si128((__m128i *) (transform32x32[4][15])));
6083             E4h = _mm_madd_epi16(m128Tmp9,
6084                     _mm_load_si128((__m128i *) (transform32x32[4][15])));
6085             E5l = _mm_madd_epi16(m128Tmp10,
6086                     _mm_load_si128((__m128i *) (transform32x32[5][15])));
6087             E5h = _mm_madd_epi16(m128Tmp11,
6088                     _mm_load_si128((__m128i *) (transform32x32[5][15])));
6089             E6l = _mm_madd_epi16(m128Tmp12,
6090                     _mm_load_si128((__m128i *) (transform32x32[6][15])));
6091             E6h = _mm_madd_epi16(m128Tmp13,
6092                     _mm_load_si128((__m128i *) (transform32x32[6][15])));
6093             E7l = _mm_madd_epi16(m128Tmp14,
6094                     _mm_load_si128((__m128i *) (transform32x32[7][15])));
6095             E7h = _mm_madd_epi16(m128Tmp15,
6096                     _mm_load_si128((__m128i *) (transform32x32[7][15])));
6097 
6098             O15l = _mm_add_epi32(E0l, E1l);
6099             O15l = _mm_add_epi32(O15l, E2l);
6100             O15l = _mm_add_epi32(O15l, E3l);
6101             O15l = _mm_add_epi32(O15l, E4l);
6102             O15l = _mm_add_epi32(O15l, E5l);
6103             O15l = _mm_add_epi32(O15l, E6l);
6104             O15l = _mm_add_epi32(O15l, E7l);
6105 
6106             O15h = _mm_add_epi32(E0h, E1h);
6107             O15h = _mm_add_epi32(O15h, E2h);
6108             O15h = _mm_add_epi32(O15h, E3h);
6109             O15h = _mm_add_epi32(O15h, E4h);
6110             O15h = _mm_add_epi32(O15h, E5h);
6111             O15h = _mm_add_epi32(O15h, E6h);
6112             O15h = _mm_add_epi32(O15h, E7h);
6113             /*  Compute E0  */
6114 
6115             m128Tmp0 = _mm_unpacklo_epi16(m128iS2, m128iS6);
6116             E0l = _mm_madd_epi16(m128Tmp0,
6117                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6118             m128Tmp1 = _mm_unpackhi_epi16(m128iS2, m128iS6);
6119             E0h = _mm_madd_epi16(m128Tmp1,
6120                     _mm_load_si128((__m128i *) (transform16x16_1[0][0])));
6121 
6122             m128Tmp2 = _mm_unpacklo_epi16(m128iS10, m128iS14);
6123             E0l = _mm_add_epi32(E0l,
6124                     _mm_madd_epi16(m128Tmp2,
6125                             _mm_load_si128(
6126                                     (__m128i *) (transform16x16_1[1][0]))));
6127             m128Tmp3 = _mm_unpackhi_epi16(m128iS10, m128iS14);
6128             E0h = _mm_add_epi32(E0h,
6129                     _mm_madd_epi16(m128Tmp3,
6130                             _mm_load_si128(
6131                                     (__m128i *) (transform16x16_1[1][0]))));
6132 
6133             m128Tmp4 = _mm_unpacklo_epi16(m128iS18, m128iS22);
6134             E0l = _mm_add_epi32(E0l,
6135                     _mm_madd_epi16(m128Tmp4,
6136                             _mm_load_si128(
6137                                     (__m128i *) (transform16x16_1[2][0]))));
6138             m128Tmp5 = _mm_unpackhi_epi16(m128iS18, m128iS22);
6139             E0h = _mm_add_epi32(E0h,
6140                     _mm_madd_epi16(m128Tmp5,
6141                             _mm_load_si128(
6142                                     (__m128i *) (transform16x16_1[2][0]))));
6143 
6144             m128Tmp6 = _mm_unpacklo_epi16(m128iS26, m128iS30);
6145             E0l = _mm_add_epi32(E0l,
6146                     _mm_madd_epi16(m128Tmp6,
6147                             _mm_load_si128(
6148                                     (__m128i *) (transform16x16_1[3][0]))));
6149             m128Tmp7 = _mm_unpackhi_epi16(m128iS26, m128iS30);
6150             E0h = _mm_add_epi32(E0h,
6151                     _mm_madd_epi16(m128Tmp7,
6152                             _mm_load_si128(
6153                                     (__m128i *) (transform16x16_1[3][0]))));
6154 
6155             /*  Compute E1  */
6156             E1l = _mm_madd_epi16(m128Tmp0,
6157                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6158             E1h = _mm_madd_epi16(m128Tmp1,
6159                     _mm_load_si128((__m128i *) (transform16x16_1[0][1])));
6160             E1l = _mm_add_epi32(E1l,
6161                     _mm_madd_epi16(m128Tmp2,
6162                             _mm_load_si128(
6163                                     (__m128i *) (transform16x16_1[1][1]))));
6164             E1h = _mm_add_epi32(E1h,
6165                     _mm_madd_epi16(m128Tmp3,
6166                             _mm_load_si128(
6167                                     (__m128i *) (transform16x16_1[1][1]))));
6168             E1l = _mm_add_epi32(E1l,
6169                     _mm_madd_epi16(m128Tmp4,
6170                             _mm_load_si128(
6171                                     (__m128i *) (transform16x16_1[2][1]))));
6172             E1h = _mm_add_epi32(E1h,
6173                     _mm_madd_epi16(m128Tmp5,
6174                             _mm_load_si128(
6175                                     (__m128i *) (transform16x16_1[2][1]))));
6176             E1l = _mm_add_epi32(E1l,
6177                     _mm_madd_epi16(m128Tmp6,
6178                             _mm_load_si128(
6179                                     (__m128i *) (transform16x16_1[3][1]))));
6180             E1h = _mm_add_epi32(E1h,
6181                     _mm_madd_epi16(m128Tmp7,
6182                             _mm_load_si128(
6183                                     (__m128i *) (transform16x16_1[3][1]))));
6184 
6185             /*  Compute E2  */
6186             E2l = _mm_madd_epi16(m128Tmp0,
6187                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6188             E2h = _mm_madd_epi16(m128Tmp1,
6189                     _mm_load_si128((__m128i *) (transform16x16_1[0][2])));
6190             E2l = _mm_add_epi32(E2l,
6191                     _mm_madd_epi16(m128Tmp2,
6192                             _mm_load_si128(
6193                                     (__m128i *) (transform16x16_1[1][2]))));
6194             E2h = _mm_add_epi32(E2h,
6195                     _mm_madd_epi16(m128Tmp3,
6196                             _mm_load_si128(
6197                                     (__m128i *) (transform16x16_1[1][2]))));
6198             E2l = _mm_add_epi32(E2l,
6199                     _mm_madd_epi16(m128Tmp4,
6200                             _mm_load_si128(
6201                                     (__m128i *) (transform16x16_1[2][2]))));
6202             E2h = _mm_add_epi32(E2h,
6203                     _mm_madd_epi16(m128Tmp5,
6204                             _mm_load_si128(
6205                                     (__m128i *) (transform16x16_1[2][2]))));
6206             E2l = _mm_add_epi32(E2l,
6207                     _mm_madd_epi16(m128Tmp6,
6208                             _mm_load_si128(
6209                                     (__m128i *) (transform16x16_1[3][2]))));
6210             E2h = _mm_add_epi32(E2h,
6211                     _mm_madd_epi16(m128Tmp7,
6212                             _mm_load_si128(
6213                                     (__m128i *) (transform16x16_1[3][2]))));
6214 
6215             /*  Compute E3  */
6216             E3l = _mm_madd_epi16(m128Tmp0,
6217                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6218             E3h = _mm_madd_epi16(m128Tmp1,
6219                     _mm_load_si128((__m128i *) (transform16x16_1[0][3])));
6220             E3l = _mm_add_epi32(E3l,
6221                     _mm_madd_epi16(m128Tmp2,
6222                             _mm_load_si128(
6223                                     (__m128i *) (transform16x16_1[1][3]))));
6224             E3h = _mm_add_epi32(E3h,
6225                     _mm_madd_epi16(m128Tmp3,
6226                             _mm_load_si128(
6227                                     (__m128i *) (transform16x16_1[1][3]))));
6228             E3l = _mm_add_epi32(E3l,
6229                     _mm_madd_epi16(m128Tmp4,
6230                             _mm_load_si128(
6231                                     (__m128i *) (transform16x16_1[2][3]))));
6232             E3h = _mm_add_epi32(E3h,
6233                     _mm_madd_epi16(m128Tmp5,
6234                             _mm_load_si128(
6235                                     (__m128i *) (transform16x16_1[2][3]))));
6236             E3l = _mm_add_epi32(E3l,
6237                     _mm_madd_epi16(m128Tmp6,
6238                             _mm_load_si128(
6239                                     (__m128i *) (transform16x16_1[3][3]))));
6240             E3h = _mm_add_epi32(E3h,
6241                     _mm_madd_epi16(m128Tmp7,
6242                             _mm_load_si128(
6243                                     (__m128i *) (transform16x16_1[3][3]))));
6244 
6245             /*  Compute E4  */
6246             E4l = _mm_madd_epi16(m128Tmp0,
6247                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6248             E4h = _mm_madd_epi16(m128Tmp1,
6249                     _mm_load_si128((__m128i *) (transform16x16_1[0][4])));
6250             E4l = _mm_add_epi32(E4l,
6251                     _mm_madd_epi16(m128Tmp2,
6252                             _mm_load_si128(
6253                                     (__m128i *) (transform16x16_1[1][4]))));
6254             E4h = _mm_add_epi32(E4h,
6255                     _mm_madd_epi16(m128Tmp3,
6256                             _mm_load_si128(
6257                                     (__m128i *) (transform16x16_1[1][4]))));
6258             E4l = _mm_add_epi32(E4l,
6259                     _mm_madd_epi16(m128Tmp4,
6260                             _mm_load_si128(
6261                                     (__m128i *) (transform16x16_1[2][4]))));
6262             E4h = _mm_add_epi32(E4h,
6263                     _mm_madd_epi16(m128Tmp5,
6264                             _mm_load_si128(
6265                                     (__m128i *) (transform16x16_1[2][4]))));
6266             E4l = _mm_add_epi32(E4l,
6267                     _mm_madd_epi16(m128Tmp6,
6268                             _mm_load_si128(
6269                                     (__m128i *) (transform16x16_1[3][4]))));
6270             E4h = _mm_add_epi32(E4h,
6271                     _mm_madd_epi16(m128Tmp7,
6272                             _mm_load_si128(
6273                                     (__m128i *) (transform16x16_1[3][4]))));
6274 
6275             /*  Compute E5  */
6276             E5l = _mm_madd_epi16(m128Tmp0,
6277                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6278             E5h = _mm_madd_epi16(m128Tmp1,
6279                     _mm_load_si128((__m128i *) (transform16x16_1[0][5])));
6280             E5l = _mm_add_epi32(E5l,
6281                     _mm_madd_epi16(m128Tmp2,
6282                             _mm_load_si128(
6283                                     (__m128i *) (transform16x16_1[1][5]))));
6284             E5h = _mm_add_epi32(E5h,
6285                     _mm_madd_epi16(m128Tmp3,
6286                             _mm_load_si128(
6287                                     (__m128i *) (transform16x16_1[1][5]))));
6288             E5l = _mm_add_epi32(E5l,
6289                     _mm_madd_epi16(m128Tmp4,
6290                             _mm_load_si128(
6291                                     (__m128i *) (transform16x16_1[2][5]))));
6292             E5h = _mm_add_epi32(E5h,
6293                     _mm_madd_epi16(m128Tmp5,
6294                             _mm_load_si128(
6295                                     (__m128i *) (transform16x16_1[2][5]))));
6296             E5l = _mm_add_epi32(E5l,
6297                     _mm_madd_epi16(m128Tmp6,
6298                             _mm_load_si128(
6299                                     (__m128i *) (transform16x16_1[3][5]))));
6300             E5h = _mm_add_epi32(E5h,
6301                     _mm_madd_epi16(m128Tmp7,
6302                             _mm_load_si128(
6303                                     (__m128i *) (transform16x16_1[3][5]))));
6304 
            /*  Compute E6: same accumulation pattern as E0..E5, using
                transform16x16_1 column [6] for each of the four row pairs  */
            E6l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
            E6h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_1[0][6])));
            E6l = _mm_add_epi32(E6l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[1][6]))));
            E6h = _mm_add_epi32(E6h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[1][6]))));
            E6l = _mm_add_epi32(E6l,
                    _mm_madd_epi16(m128Tmp4,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[2][6]))));
            E6h = _mm_add_epi32(E6h,
                    _mm_madd_epi16(m128Tmp5,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[2][6]))));
            E6l = _mm_add_epi32(E6l,
                    _mm_madd_epi16(m128Tmp6,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[3][6]))));
            E6h = _mm_add_epi32(E6h,
                    _mm_madd_epi16(m128Tmp7,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[3][6]))));

            /*  Compute E7: last column [7] of transform16x16_1  */
            E7l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
            E7h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_1[0][7])));
            E7l = _mm_add_epi32(E7l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[1][7]))));
            E7h = _mm_add_epi32(E7h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[1][7]))));
            E7l = _mm_add_epi32(E7l,
                    _mm_madd_epi16(m128Tmp4,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[2][7]))));
            E7h = _mm_add_epi32(E7h,
                    _mm_madd_epi16(m128Tmp5,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[2][7]))));
            E7l = _mm_add_epi32(E7l,
                    _mm_madd_epi16(m128Tmp6,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[3][7]))));
            E7h = _mm_add_epi32(E7h,
                    _mm_madd_epi16(m128Tmp7,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_1[3][7]))));
6364 
            /*  Compute E00..E03 from inputs 4, 12, 20, 28 using
                transform16x16_2; these are combined with EE0..EE3 below.
                (Comment previously duplicated the "EE0 and EEE" label of
                the next section.)  */

            m128Tmp0 = _mm_unpacklo_epi16(m128iS4, m128iS12);
            E00l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));
            m128Tmp1 = _mm_unpackhi_epi16(m128iS4, m128iS12);
            E00h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][0])));

            m128Tmp2 = _mm_unpacklo_epi16(m128iS20, m128iS28);
            E00l = _mm_add_epi32(E00l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][0]))));
            m128Tmp3 = _mm_unpackhi_epi16(m128iS20, m128iS28);
            E00h = _mm_add_epi32(E00h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][0]))));

            E01l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
            E01h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][1])));
            E01l = _mm_add_epi32(E01l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][1]))));
            E01h = _mm_add_epi32(E01h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][1]))));

            E02l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
            E02h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][2])));
            E02l = _mm_add_epi32(E02l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][2]))));
            E02h = _mm_add_epi32(E02h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][2]))));

            E03l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
            E03h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_2[0][3])));
            E03l = _mm_add_epi32(E03l,
                    _mm_madd_epi16(m128Tmp2,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][3]))));
            E03h = _mm_add_epi32(E03h,
                    _mm_madd_epi16(m128Tmp3,
                            _mm_load_si128(
                                    (__m128i *) (transform16x16_2[1][3]))));
6423 
            /*  Compute EE0/EE1 (from inputs 8, 24, transform16x16_3 row 0)
                and EEE0/EEE1 (from inputs 0, 16, transform16x16_3 row 1)  */

            m128Tmp0 = _mm_unpacklo_epi16(m128iS8, m128iS24);
            EE0l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));
            m128Tmp1 = _mm_unpackhi_epi16(m128iS8, m128iS24);
            EE0h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_3[0][0])));

            m128Tmp2 = _mm_unpacklo_epi16(m128iS0, m128iS16);
            EEE0l = _mm_madd_epi16(m128Tmp2,
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));
            m128Tmp3 = _mm_unpackhi_epi16(m128iS0, m128iS16);
            EEE0h = _mm_madd_epi16(m128Tmp3,
                    _mm_load_si128((__m128i *) (transform16x16_3[1][0])));

            EE1l = _mm_madd_epi16(m128Tmp0,
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));
            EE1h = _mm_madd_epi16(m128Tmp1,
                    _mm_load_si128((__m128i *) (transform16x16_3[0][1])));

            EEE1l = _mm_madd_epi16(m128Tmp2,
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));
            EEE1h = _mm_madd_epi16(m128Tmp3,
                    _mm_load_si128((__m128i *) (transform16x16_3[1][1])));

            /*  Butterfly: EE0..EE3 = EEE -/+ EE (note EE2/EE3 are the
                "difference" outputs and reuse the EE registers)  */

            EE2l = _mm_sub_epi32(EEE1l, EE1l);
            EE3l = _mm_sub_epi32(EEE0l, EE0l);
            EE2h = _mm_sub_epi32(EEE1h, EE1h);
            EE3h = _mm_sub_epi32(EEE0h, EE0h);

            EE0l = _mm_add_epi32(EEE0l, EE0l);
            EE1l = _mm_add_epi32(EEE1l, EE1l);
            EE0h = _mm_add_epi32(EEE0h, EE0h);
            EE1h = _mm_add_epi32(EEE1h, EE1h);

            /*  Second butterfly: EE0..EE7 = (EE0..EE3) +/- (E00..E03)  */
            EE7l = _mm_sub_epi32(EE0l, E00l);
            EE6l = _mm_sub_epi32(EE1l, E01l);
            EE5l = _mm_sub_epi32(EE2l, E02l);
            EE4l = _mm_sub_epi32(EE3l, E03l);

            EE7h = _mm_sub_epi32(EE0h, E00h);
            EE6h = _mm_sub_epi32(EE1h, E01h);
            EE5h = _mm_sub_epi32(EE2h, E02h);
            EE4h = _mm_sub_epi32(EE3h, E03h);

            EE0l = _mm_add_epi32(EE0l, E00l);
            EE1l = _mm_add_epi32(EE1l, E01l);
            EE2l = _mm_add_epi32(EE2l, E02l);
            EE3l = _mm_add_epi32(EE3l, E03l);

            EE0h = _mm_add_epi32(EE0h, E00h);
            EE1h = _mm_add_epi32(EE1h, E01h);
            EE2h = _mm_add_epi32(EE2h, E02h);
            EE3h = _mm_add_epi32(EE3h, E03h);
            /*  Compute E0..E15 = EE +/- E, folding in the rounding offset
                m128iAdd once per term (it is shifted out by the srai below)  */

            E15l = _mm_sub_epi32(EE0l, E0l);
            E15l = _mm_add_epi32(E15l, m128iAdd);
            E14l = _mm_sub_epi32(EE1l, E1l);
            E14l = _mm_add_epi32(E14l, m128iAdd);
            E13l = _mm_sub_epi32(EE2l, E2l);
            E13l = _mm_add_epi32(E13l, m128iAdd);
            E12l = _mm_sub_epi32(EE3l, E3l);
            E12l = _mm_add_epi32(E12l, m128iAdd);
            E11l = _mm_sub_epi32(EE4l, E4l);
            E11l = _mm_add_epi32(E11l, m128iAdd);
            E10l = _mm_sub_epi32(EE5l, E5l);
            E10l = _mm_add_epi32(E10l, m128iAdd);
            E9l = _mm_sub_epi32(EE6l, E6l);
            E9l = _mm_add_epi32(E9l, m128iAdd);
            E8l = _mm_sub_epi32(EE7l, E7l);
            E8l = _mm_add_epi32(E8l, m128iAdd);

            E0l = _mm_add_epi32(EE0l, E0l);
            E0l = _mm_add_epi32(E0l, m128iAdd);
            E1l = _mm_add_epi32(EE1l, E1l);
            E1l = _mm_add_epi32(E1l, m128iAdd);
            E2l = _mm_add_epi32(EE2l, E2l);
            E2l = _mm_add_epi32(E2l, m128iAdd);
            E3l = _mm_add_epi32(EE3l, E3l);
            E3l = _mm_add_epi32(E3l, m128iAdd);
            E4l = _mm_add_epi32(EE4l, E4l);
            E4l = _mm_add_epi32(E4l, m128iAdd);
            E5l = _mm_add_epi32(EE5l, E5l);
            E5l = _mm_add_epi32(E5l, m128iAdd);
            E6l = _mm_add_epi32(EE6l, E6l);
            E6l = _mm_add_epi32(E6l, m128iAdd);
            E7l = _mm_add_epi32(EE7l, E7l);
            E7l = _mm_add_epi32(E7l, m128iAdd);

            /*  Same for the high halves  */
            E15h = _mm_sub_epi32(EE0h, E0h);
            E15h = _mm_add_epi32(E15h, m128iAdd);
            E14h = _mm_sub_epi32(EE1h, E1h);
            E14h = _mm_add_epi32(E14h, m128iAdd);
            E13h = _mm_sub_epi32(EE2h, E2h);
            E13h = _mm_add_epi32(E13h, m128iAdd);
            E12h = _mm_sub_epi32(EE3h, E3h);
            E12h = _mm_add_epi32(E12h, m128iAdd);
            E11h = _mm_sub_epi32(EE4h, E4h);
            E11h = _mm_add_epi32(E11h, m128iAdd);
            E10h = _mm_sub_epi32(EE5h, E5h);
            E10h = _mm_add_epi32(E10h, m128iAdd);
            E9h = _mm_sub_epi32(EE6h, E6h);
            E9h = _mm_add_epi32(E9h, m128iAdd);
            E8h = _mm_sub_epi32(EE7h, E7h);
            E8h = _mm_add_epi32(E8h, m128iAdd);

            E0h = _mm_add_epi32(EE0h, E0h);
            E0h = _mm_add_epi32(E0h, m128iAdd);
            E1h = _mm_add_epi32(EE1h, E1h);
            E1h = _mm_add_epi32(E1h, m128iAdd);
            E2h = _mm_add_epi32(EE2h, E2h);
            E2h = _mm_add_epi32(E2h, m128iAdd);
            E3h = _mm_add_epi32(EE3h, E3h);
            E3h = _mm_add_epi32(E3h, m128iAdd);
            E4h = _mm_add_epi32(EE4h, E4h);
            E4h = _mm_add_epi32(E4h, m128iAdd);
            E5h = _mm_add_epi32(EE5h, E5h);
            E5h = _mm_add_epi32(E5h, m128iAdd);
            E6h = _mm_add_epi32(EE6h, E6h);
            E6h = _mm_add_epi32(E6h, m128iAdd);
            E7h = _mm_add_epi32(EE7h, E7h);
            E7h = _mm_add_epi32(E7h, m128iAdd);
6551 
            /*  Outputs 0..15: (E + O) >> shift, saturated/packed to 16 bit  */
            m128iS0 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E0l, O0l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E0h, O0h), shift));
            m128iS1 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E1l, O1l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E1h, O1h), shift));
            m128iS2 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E2l, O2l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E2h, O2h), shift));
            m128iS3 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E3l, O3l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E3h, O3h), shift));
            m128iS4 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E4l, O4l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E4h, O4h), shift));
            m128iS5 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E5l, O5l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E5h, O5h), shift));
            m128iS6 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E6l, O6l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E6h, O6h), shift));
            m128iS7 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E7l, O7l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E7h, O7h), shift));
            m128iS8 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E8l, O8l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E8h, O8h), shift));
            m128iS9 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E9l, O9l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E9h, O9h), shift));
            m128iS10 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E10l, O10l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E10h, O10h), shift));
            m128iS11 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E11l, O11l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E11h, O11h), shift));
            m128iS12 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E12l, O12l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E12h, O12h), shift));
            m128iS13 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E13l, O13l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E13h, O13h), shift));
            m128iS14 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E14l, O14l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E14h, O14h), shift));
            m128iS15 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_add_epi32(E15l, O15l), shift),
                    _mm_srai_epi32(_mm_add_epi32(E15h, O15h), shift));

            /*  Outputs 31..16 (mirror order): (E - O) >> shift  */
            m128iS31 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E0l, O0l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E0h, O0h), shift));
            m128iS30 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E1l, O1l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E1h, O1h), shift));
            m128iS29 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E2l, O2l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E2h, O2h), shift));
            m128iS28 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E3l, O3l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E3h, O3h), shift));
            m128iS27 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E4l, O4l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E4h, O4h), shift));
            m128iS26 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E5l, O5l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E5h, O5h), shift));
            m128iS25 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E6l, O6l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E6h, O6h), shift));
            m128iS24 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E7l, O7l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E7h, O7h), shift));
            m128iS23 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E8l, O8l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E8h, O8h), shift));
            m128iS22 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E9l, O9l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E9h, O9h), shift));
            m128iS21 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E10l, O10l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E10h, O10h), shift));
            m128iS20 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E11l, O11l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E11h, O11h), shift));
            m128iS19 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E12l, O12l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E12h, O12h), shift));
            m128iS18 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E13l, O13l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E13h, O13h), shift));
            m128iS17 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E14l, O14l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E14h, O14h), shift));
            m128iS16 = _mm_packs_epi32(
                    _mm_srai_epi32(_mm_sub_epi32(E15l, O15l), shift),
                    _mm_srai_epi32(_mm_sub_epi32(E15h, O15h), shift));
6649 
            if (!j) {
                /*  First pass only: transpose the 32x16 slice of 16-bit
                    results via interleave (unpack) trees, then store it
                    back to src (row stride 32 int16) and load the next
                    8-column slice.  */

                /*  Stage 1: interleave row r with row r+16  */
                E0l = _mm_unpacklo_epi16(m128iS0, m128iS16);
                E1l = _mm_unpacklo_epi16(m128iS1, m128iS17);
                E2l = _mm_unpacklo_epi16(m128iS2, m128iS18);
                E3l = _mm_unpacklo_epi16(m128iS3, m128iS19);
                E4l = _mm_unpacklo_epi16(m128iS4, m128iS20);
                E5l = _mm_unpacklo_epi16(m128iS5, m128iS21);
                E6l = _mm_unpacklo_epi16(m128iS6, m128iS22);
                E7l = _mm_unpacklo_epi16(m128iS7, m128iS23);
                E8l = _mm_unpacklo_epi16(m128iS8, m128iS24);
                E9l = _mm_unpacklo_epi16(m128iS9, m128iS25);
                E10l = _mm_unpacklo_epi16(m128iS10, m128iS26);
                E11l = _mm_unpacklo_epi16(m128iS11, m128iS27);
                E12l = _mm_unpacklo_epi16(m128iS12, m128iS28);
                E13l = _mm_unpacklo_epi16(m128iS13, m128iS29);
                E14l = _mm_unpacklo_epi16(m128iS14, m128iS30);
                E15l = _mm_unpacklo_epi16(m128iS15, m128iS31);

                O0l = _mm_unpackhi_epi16(m128iS0, m128iS16);
                O1l = _mm_unpackhi_epi16(m128iS1, m128iS17);
                O2l = _mm_unpackhi_epi16(m128iS2, m128iS18);
                O3l = _mm_unpackhi_epi16(m128iS3, m128iS19);
                O4l = _mm_unpackhi_epi16(m128iS4, m128iS20);
                O5l = _mm_unpackhi_epi16(m128iS5, m128iS21);
                O6l = _mm_unpackhi_epi16(m128iS6, m128iS22);
                O7l = _mm_unpackhi_epi16(m128iS7, m128iS23);
                O8l = _mm_unpackhi_epi16(m128iS8, m128iS24);
                O9l = _mm_unpackhi_epi16(m128iS9, m128iS25);
                O10l = _mm_unpackhi_epi16(m128iS10, m128iS26);
                O11l = _mm_unpackhi_epi16(m128iS11, m128iS27);
                O12l = _mm_unpackhi_epi16(m128iS12, m128iS28);
                O13l = _mm_unpackhi_epi16(m128iS13, m128iS29);
                O14l = _mm_unpackhi_epi16(m128iS14, m128iS30);
                O15l = _mm_unpackhi_epi16(m128iS15, m128iS31);

                /*  Stage 2: interleave with stride 8  */
                E0h = _mm_unpacklo_epi16(E0l, E8l);
                E1h = _mm_unpacklo_epi16(E1l, E9l);
                E2h = _mm_unpacklo_epi16(E2l, E10l);
                E3h = _mm_unpacklo_epi16(E3l, E11l);
                E4h = _mm_unpacklo_epi16(E4l, E12l);
                E5h = _mm_unpacklo_epi16(E5l, E13l);
                E6h = _mm_unpacklo_epi16(E6l, E14l);
                E7h = _mm_unpacklo_epi16(E7l, E15l);

                E8h = _mm_unpackhi_epi16(E0l, E8l);
                E9h = _mm_unpackhi_epi16(E1l, E9l);
                E10h = _mm_unpackhi_epi16(E2l, E10l);
                E11h = _mm_unpackhi_epi16(E3l, E11l);
                E12h = _mm_unpackhi_epi16(E4l, E12l);
                E13h = _mm_unpackhi_epi16(E5l, E13l);
                E14h = _mm_unpackhi_epi16(E6l, E14l);
                E15h = _mm_unpackhi_epi16(E7l, E15l);

                /*  Stages 3-4: finish the transpose of the even-interleave
                    half, producing transposed rows S0..S15  */
                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS0 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS1 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS2 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS3 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS4 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS5 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS6 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS7 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS8 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS9 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS10 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS11 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS12 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS13 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS14 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS15 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                /*  Same stages 2-4 for the odd-interleave half (O0l..O15l),
                    producing transposed rows S16..S31  */
                E0h = _mm_unpacklo_epi16(O0l, O8l);
                E1h = _mm_unpacklo_epi16(O1l, O9l);
                E2h = _mm_unpacklo_epi16(O2l, O10l);
                E3h = _mm_unpacklo_epi16(O3l, O11l);
                E4h = _mm_unpacklo_epi16(O4l, O12l);
                E5h = _mm_unpacklo_epi16(O5l, O13l);
                E6h = _mm_unpacklo_epi16(O6l, O14l);
                E7h = _mm_unpacklo_epi16(O7l, O15l);

                E8h = _mm_unpackhi_epi16(O0l, O8l);
                E9h = _mm_unpackhi_epi16(O1l, O9l);
                E10h = _mm_unpackhi_epi16(O2l, O10l);
                E11h = _mm_unpackhi_epi16(O3l, O11l);
                E12h = _mm_unpackhi_epi16(O4l, O12l);
                E13h = _mm_unpackhi_epi16(O5l, O13l);
                E14h = _mm_unpackhi_epi16(O6l, O14l);
                E15h = _mm_unpackhi_epi16(O7l, O15l);

                m128Tmp0 = _mm_unpacklo_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpacklo_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpacklo_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpacklo_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS16 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS17 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS18 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS19 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E0h, E4h);
                m128Tmp1 = _mm_unpackhi_epi16(E1h, E5h);
                m128Tmp2 = _mm_unpackhi_epi16(E2h, E6h);
                m128Tmp3 = _mm_unpackhi_epi16(E3h, E7h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS20 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS21 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS22 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS23 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpacklo_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpacklo_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpacklo_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpacklo_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS24 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS25 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS26 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS27 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp0 = _mm_unpackhi_epi16(E8h, E12h);
                m128Tmp1 = _mm_unpackhi_epi16(E9h, E13h);
                m128Tmp2 = _mm_unpackhi_epi16(E10h, E14h);
                m128Tmp3 = _mm_unpackhi_epi16(E11h, E15h);

                m128Tmp4 = _mm_unpacklo_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpacklo_epi16(m128Tmp1, m128Tmp3);
                m128iS28 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS29 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                m128Tmp4 = _mm_unpackhi_epi16(m128Tmp0, m128Tmp2);
                m128Tmp5 = _mm_unpackhi_epi16(m128Tmp1, m128Tmp3);
                m128iS30 = _mm_unpacklo_epi16(m128Tmp4, m128Tmp5);
                m128iS31 = _mm_unpackhi_epi16(m128Tmp4, m128Tmp5);

                /*  Store transposed rows back to src; row stride is 32
                    int16 coefficients, column offset i  */
                _mm_store_si128((__m128i *) (src + i), m128iS0);
                _mm_store_si128((__m128i *) (src + 32 + i), m128iS1);
                _mm_store_si128((__m128i *) (src + 64 + i), m128iS2);
                _mm_store_si128((__m128i *) (src + 96 + i), m128iS3);
                _mm_store_si128((__m128i *) (src + 128 + i), m128iS4);
                _mm_store_si128((__m128i *) (src + 160 + i), m128iS5);
                _mm_store_si128((__m128i *) (src + 192 + i), m128iS6);
                _mm_store_si128((__m128i *) (src + 224 + i), m128iS7);
                _mm_store_si128((__m128i *) (src + 256 + i), m128iS8);
                _mm_store_si128((__m128i *) (src + 288 + i), m128iS9);
                _mm_store_si128((__m128i *) (src + 320 + i), m128iS10);
                _mm_store_si128((__m128i *) (src + 352 + i), m128iS11);
                _mm_store_si128((__m128i *) (src + 384 + i), m128iS12);
                _mm_store_si128((__m128i *) (src + 416 + i), m128iS13);
                _mm_store_si128((__m128i *) (src + 448 + i), m128iS14);
                _mm_store_si128((__m128i *) (src + 480 + i), m128iS15);
                _mm_store_si128((__m128i *) (src + 512 + i), m128iS16);
                _mm_store_si128((__m128i *) (src + 544 + i), m128iS17);
                _mm_store_si128((__m128i *) (src + 576 + i), m128iS18);
                _mm_store_si128((__m128i *) (src + 608 + i), m128iS19);
                _mm_store_si128((__m128i *) (src + 640 + i), m128iS20);
                _mm_store_si128((__m128i *) (src + 672 + i), m128iS21);
                _mm_store_si128((__m128i *) (src + 704 + i), m128iS22);
                _mm_store_si128((__m128i *) (src + 736 + i), m128iS23);
                _mm_store_si128((__m128i *) (src + 768 + i), m128iS24);
                _mm_store_si128((__m128i *) (src + 800 + i), m128iS25);
                _mm_store_si128((__m128i *) (src + 832 + i), m128iS26);
                _mm_store_si128((__m128i *) (src + 864 + i), m128iS27);
                _mm_store_si128((__m128i *) (src + 896 + i), m128iS28);
                _mm_store_si128((__m128i *) (src + 928 + i), m128iS29);
                _mm_store_si128((__m128i *) (src + 960 + i), m128iS30);
                _mm_store_si128((__m128i *) (src + 992 + i), m128iS31);

                /*  If more column slices remain, reload the next 8 columns
                    (k = i + 8) for the following iteration  */
                if (i <= 16) {
                    int k = i + 8;
                    m128iS0 = _mm_load_si128((__m128i *) (src + k));
                    m128iS1 = _mm_load_si128((__m128i *) (src + 32 + k));
                    m128iS2 = _mm_load_si128((__m128i *) (src + 64 + k));
                    m128iS3 = _mm_load_si128((__m128i *) (src + 96 + k));
                    m128iS4 = _mm_load_si128((__m128i *) (src + 128 + k));
                    m128iS5 = _mm_load_si128((__m128i *) (src + 160 + k));
                    m128iS6 = _mm_load_si128((__m128i *) (src + 192 + k));
                    m128iS7 = _mm_load_si128((__m128i *) (src + 224 + k));
                    m128iS8 = _mm_load_si128((__m128i *) (src + 256 + k));
                    m128iS9 = _mm_load_si128((__m128i *) (src + 288 + k));
                    m128iS10 = _mm_load_si128((__m128i *) (src + 320 + k));
                    m128iS11 = _mm_load_si128((__m128i *) (src + 352 + k));
                    m128iS12 = _mm_load_si128((__m128i *) (src + 384 + k));
                    m128iS13 = _mm_load_si128((__m128i *) (src + 416 + k));
                    m128iS14 = _mm_load_si128((__m128i *) (src + 448 + k));
                    m128iS15 = _mm_load_si128((__m128i *) (src + 480 + k));

                    m128iS16 = _mm_load_si128((__m128i *) (src + 512 + k));
                    m128iS17 = _mm_load_si128((__m128i *) (src + 544 + k));
6897                     m128iS18 = _mm_load_si128((__m128i *) (src + 576 + k));
6898                     m128iS19 = _mm_load_si128((__m128i *) (src + 608 + k));
6899                     m128iS20 = _mm_load_si128((__m128i *) (src + 640 + k));
6900                     m128iS21 = _mm_load_si128((__m128i *) (src + 672 + k));
6901                     m128iS22 = _mm_load_si128((__m128i *) (src + 704 + k));
6902                     m128iS23 = _mm_load_si128((__m128i *) (src + 736 + k));
6903                     m128iS24 = _mm_load_si128((__m128i *) (src + 768 + k));
6904                     m128iS25 = _mm_load_si128((__m128i *) (src + 800 + k));
6905                     m128iS26 = _mm_load_si128((__m128i *) (src + 832 + k));
6906                     m128iS27 = _mm_load_si128((__m128i *) (src + 864 + k));
6907                     m128iS28 = _mm_load_si128((__m128i *) (src + 896 + k));
6908                     m128iS29 = _mm_load_si128((__m128i *) (src + 928 + k));
6909                     m128iS30 = _mm_load_si128((__m128i *) (src + 960 + k));
6910                     m128iS31 = _mm_load_si128((__m128i *) (src + 992 + k));
6911                 } else {
6912                     m128iS0 = _mm_load_si128((__m128i *) (src));
6913                     m128iS1 = _mm_load_si128((__m128i *) (src + 128));
6914                     m128iS2 = _mm_load_si128((__m128i *) (src + 256));
6915                     m128iS3 = _mm_load_si128((__m128i *) (src + 384));
6916                     m128iS4 = _mm_loadu_si128((__m128i *) (src + 512));
6917                     m128iS5 = _mm_load_si128((__m128i *) (src + 640));
6918                     m128iS6 = _mm_load_si128((__m128i *) (src + 768));
6919                     m128iS7 = _mm_load_si128((__m128i *) (src + 896));
6920                     m128iS8 = _mm_load_si128((__m128i *) (src + 8));
6921                     m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8));
6922                     m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8));
6923                     m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8));
6924                     m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8));
6925                     m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8));
6926                     m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8));
6927                     m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8));
6928                     m128iS16 = _mm_load_si128((__m128i *) (src + 16));
6929                     m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16));
6930                     m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16));
6931                     m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16));
6932                     m128iS20 = _mm_loadu_si128((__m128i *) (src + 512 + 16));
6933                     m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16));
6934                     m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16));
6935                     m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16));
6936                     m128iS24 = _mm_load_si128((__m128i *) (src + 24));
6937                     m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24));
6938                     m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24));
6939                     m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24));
6940                     m128iS28 = _mm_loadu_si128((__m128i *) (src + 512 + 24));
6941                     m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24));
6942                     m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24));
6943                     m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24));
6944                     shift = shift_2nd;
6945                     m128iAdd = _mm_set1_epi32(add_2nd);
6946                 }
6947 
6948             } else {
6949                 int k, m = 0;
6950                 _mm_storeu_si128((__m128i *) (src), m128iS0);
6951                 _mm_storeu_si128((__m128i *) (src + 8), m128iS1);
6952                 _mm_storeu_si128((__m128i *) (src + 16), m128iS2);
6953                 _mm_storeu_si128((__m128i *) (src + 24), m128iS3);
6954                 _mm_storeu_si128((__m128i *) (src + 128), m128iS4);
6955                 _mm_storeu_si128((__m128i *) (src + 128 + 8), m128iS5);
6956                 _mm_storeu_si128((__m128i *) (src + 128 + 16), m128iS6);
6957                 _mm_storeu_si128((__m128i *) (src + 128 + 24), m128iS7);
6958                 _mm_storeu_si128((__m128i *) (src + 256), m128iS8);
6959                 _mm_storeu_si128((__m128i *) (src + 256 + 8), m128iS9);
6960                 _mm_storeu_si128((__m128i *) (src + 256 + 16), m128iS10);
6961                 _mm_storeu_si128((__m128i *) (src + 256 + 24), m128iS11);
6962                 _mm_storeu_si128((__m128i *) (src + 384), m128iS12);
6963                 _mm_storeu_si128((__m128i *) (src + 384 + 8), m128iS13);
6964                 _mm_storeu_si128((__m128i *) (src + 384 + 16), m128iS14);
6965                 _mm_storeu_si128((__m128i *) (src + 384 + 24), m128iS15);
6966 
6967                 _mm_storeu_si128((__m128i *) (src + 512), m128iS16);
6968                 _mm_storeu_si128((__m128i *) (src + 512 + 8), m128iS17);
6969                 _mm_storeu_si128((__m128i *) (src + 512 + 16), m128iS18);
6970                 _mm_storeu_si128((__m128i *) (src + 512 + 24), m128iS19);
6971                 _mm_storeu_si128((__m128i *) (src + 640), m128iS20);
6972                 _mm_storeu_si128((__m128i *) (src + 640 + 8), m128iS21);
6973                 _mm_storeu_si128((__m128i *) (src + 640 + 16), m128iS22);
6974                 _mm_storeu_si128((__m128i *) (src + 640 + 24), m128iS23);
6975                 _mm_storeu_si128((__m128i *) (src + 768), m128iS24);
6976                 _mm_storeu_si128((__m128i *) (src + 768 + 8), m128iS25);
6977                 _mm_storeu_si128((__m128i *) (src + 768 + 16), m128iS26);
6978                 _mm_storeu_si128((__m128i *) (src + 768 + 24), m128iS27);
6979                 _mm_storeu_si128((__m128i *) (src + 896), m128iS28);
6980                 _mm_storeu_si128((__m128i *) (src + 896 + 8), m128iS29);
6981                 _mm_storeu_si128((__m128i *) (src + 896 + 16), m128iS30);
6982                 _mm_storeu_si128((__m128i *) (src + 896 + 24), m128iS31);
6983                 dst = (uint16_t*) _dst + (i * stride);
6984                 for (k = 0; k < 8; k++) {
6985                     dst[0] = av_clip_uintp2(dst[0] + src[m],10);
6986                     dst[1] = av_clip_uintp2(dst[1] + src[m + 8],10);
6987                     dst[2] = av_clip_uintp2(dst[2] + src[m + 16],10);
6988                     dst[3] = av_clip_uintp2(dst[3] + src[m + 24],10);
6989                     dst[4] = av_clip_uintp2(
6990                             dst[4] + src[m + 128],10);
6991                     dst[5] = av_clip_uintp2(
6992                             dst[5] + src[m + 128 + 8],10);
6993                     dst[6] = av_clip_uintp2(
6994                             dst[6] + src[m + 128 + 16],10);
6995                     dst[7] = av_clip_uintp2(
6996                             dst[7] + src[m + 128 + 24],10);
6997 
6998                     dst[8] = av_clip_uintp2(
6999                             dst[8] + src[m + 256],10);
7000                     dst[9] = av_clip_uintp2(
7001                             dst[9] + src[m + 256 + 8],10);
7002                     dst[10] = av_clip_uintp2(
7003                             dst[10] + src[m + 256 + 16],10);
7004                     dst[11] = av_clip_uintp2(
7005                             dst[11] + src[m + 256 + 24],10);
7006                     dst[12] = av_clip_uintp2(
7007                             dst[12] + src[m + 384],10);
7008                     dst[13] = av_clip_uintp2(
7009                             dst[13] + src[m + 384 + 8],10);
7010                     dst[14] = av_clip_uintp2(
7011                             dst[14] + src[m + 384 + 16],10);
7012                     dst[15] = av_clip_uintp2(
7013                             dst[15] + src[m + 384 + 24],10);
7014 
7015                     dst[16] = av_clip_uintp2(
7016                             dst[16] + src[m + 512],10);
7017                     dst[17] = av_clip_uintp2(
7018                             dst[17] + src[m + 512 + 8],10);
7019                     dst[18] = av_clip_uintp2(
7020                             dst[18] + src[m + 512 + 16],10);
7021                     dst[19] = av_clip_uintp2(
7022                             dst[19] + src[m + 512 + 24],10);
7023                     dst[20] = av_clip_uintp2(
7024                             dst[20] + src[m + 640],10);
7025                     dst[21] = av_clip_uintp2(
7026                             dst[21] + src[m + 640 + 8],10);
7027                     dst[22] = av_clip_uintp2(
7028                             dst[22] + src[m + 640 + 16],10);
7029                     dst[23] = av_clip_uintp2(
7030                             dst[23] + src[m + 640 + 24],10);
7031 
7032                     dst[24] = av_clip_uintp2(
7033                             dst[24] + src[m + 768],10);
7034                     dst[25] = av_clip_uintp2(
7035                             dst[25] + src[m + 768 + 8],10);
7036                     dst[26] = av_clip_uintp2(
7037                             dst[26] + src[m + 768 + 16],10);
7038                     dst[27] = av_clip_uintp2(
7039                             dst[27] + src[m + 768 + 24],10);
7040                     dst[28] = av_clip_uintp2(
7041                             dst[28] + src[m + 896],10);
7042                     dst[29] = av_clip_uintp2(
7043                             dst[29] + src[m + 896 + 8],10);
7044                     dst[30] = av_clip_uintp2(
7045                             dst[30] + src[m + 896 + 16],10);
7046                     dst[31] = av_clip_uintp2(
7047                             dst[31] + src[m + 896 + 24],10);
7048 
7049                     m += 1;
7050                     dst += stride;
7051                 }
7052                 if (i <= 16) {
7053                     int k = (i + 8) * 4;
7054                     m128iS0 = _mm_load_si128((__m128i *) (src + k));
7055                     m128iS1 = _mm_load_si128((__m128i *) (src + 128 + k));
7056                     m128iS2 = _mm_load_si128((__m128i *) (src + 256 + k));
7057                     m128iS3 = _mm_load_si128((__m128i *) (src + 384 + k));
7058                     m128iS4 = _mm_loadu_si128((__m128i *) (src + 512 + k));
7059                     m128iS5 = _mm_load_si128((__m128i *) (src + 640 + k));
7060                     m128iS6 = _mm_load_si128((__m128i *) (src + 768 + k));
7061                     m128iS7 = _mm_load_si128((__m128i *) (src + 896 + k));
7062                     m128iS8 = _mm_load_si128((__m128i *) (src + 8 + k));
7063                     m128iS9 = _mm_load_si128((__m128i *) (src + 128 + 8 + k));
7064                     m128iS10 = _mm_load_si128((__m128i *) (src + 256 + 8 + k));
7065                     m128iS11 = _mm_load_si128((__m128i *) (src + 384 + 8 + k));
7066                     m128iS12 = _mm_loadu_si128((__m128i *) (src + 512 + 8 + k));
7067                     m128iS13 = _mm_load_si128((__m128i *) (src + 640 + 8 + k));
7068                     m128iS14 = _mm_load_si128((__m128i *) (src + 768 + 8 + k));
7069                     m128iS15 = _mm_load_si128((__m128i *) (src + 896 + 8 + k));
7070                     m128iS16 = _mm_load_si128((__m128i *) (src + 16 + k));
7071                     m128iS17 = _mm_load_si128((__m128i *) (src + 128 + 16 + k));
7072                     m128iS18 = _mm_load_si128((__m128i *) (src + 256 + 16 + k));
7073                     m128iS19 = _mm_load_si128((__m128i *) (src + 384 + 16 + k));
7074                     m128iS20 = _mm_loadu_si128(
7075                             (__m128i *) (src + 512 + 16 + k));
7076                     m128iS21 = _mm_load_si128((__m128i *) (src + 640 + 16 + k));
7077                     m128iS22 = _mm_load_si128((__m128i *) (src + 768 + 16 + k));
7078                     m128iS23 = _mm_load_si128((__m128i *) (src + 896 + 16 + k));
7079                     m128iS24 = _mm_load_si128((__m128i *) (src + 24 + k));
7080                     m128iS25 = _mm_load_si128((__m128i *) (src + 128 + 24 + k));
7081                     m128iS26 = _mm_load_si128((__m128i *) (src + 256 + 24 + k));
7082                     m128iS27 = _mm_load_si128((__m128i *) (src + 384 + 24 + k));
7083                     m128iS28 = _mm_loadu_si128(
7084                             (__m128i *) (src + 512 + 24 + k));
7085                     m128iS29 = _mm_load_si128((__m128i *) (src + 640 + 24 + k));
7086                     m128iS30 = _mm_load_si128((__m128i *) (src + 768 + 24 + k));
7087                     m128iS31 = _mm_load_si128((__m128i *) (src + 896 + 24 + k));
7088                 }
7089             }
7090         }
7091     }
7092 }
7093 #endif
7094 
7095