1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // *       Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // *       Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // *       Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33 
34 #ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
35 #define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
36 
37 //
// Various SSE accelerated functions, used by Imf::DwaCompressor.
// These have been moved into a separate .h file, as the fast
// paths are done with template specialization.
//
// Unless otherwise noted, all pointers are assumed to be 32-byte
// aligned. Unaligned pointers risk seg-faulting.
44 //
45 
46 #include "ImfNamespace.h"
47 #include "ImfSimd.h"
48 #include "ImfSystemSpecific.h"
49 #include "OpenEXRConfig.h"
50 
51 #include <half.h>
52 #include <assert.h>
53 
54 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
55 
56 #define _SSE_ALIGNMENT        32
57 #define _SSE_ALIGNMENT_MASK 0x0F
58 #define _AVX_ALIGNMENT_MASK 0x1F
59 
60 //
61 // Test if we should enable GCC inline asm paths for AVX
62 //
63 
64 #ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX
65 
66     #define IMF_HAVE_GCC_INLINEASM
67 
68     #ifdef __LP64__
69         #define IMF_HAVE_GCC_INLINEASM_64
70     #endif /* __LP64__ */
71 
72 #endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */
73 
74 //
75 // A simple 64-element array, aligned properly for SIMD access.
76 //
77 
78 template <class T>
79 class SimdAlignedBuffer64
80 {
81     public:
82 
        SimdAlignedBuffer64(): _buffer (0), _handle (0)
84         {
85             alloc();
86         }
87 
        SimdAlignedBuffer64(const SimdAlignedBuffer64 &rhs): _handle(0)
89         {
90             alloc();
91             memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
92         }
93 
        ~SimdAlignedBuffer64 ()
95         {
96             EXRFreeAligned (_handle);
97             _handle = 0;
98             _buffer = 0;
99         }
100 
        void alloc()
102         {
            //
            // Try EXRAllocAligned first - but it might fall back to
            // unaligned allocs. If so, over-allocate.
            //
107 
108             _handle = (char *) EXRAllocAligned
109                 (64 * sizeof(T), _SSE_ALIGNMENT);
110 
111             if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0)
112             {
113                 _buffer = (T *)_handle;
114                 return;
115             }
116 
117             EXRFreeAligned(_handle);
118             _handle = (char *) EXRAllocAligned
119                 (64 * sizeof(T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
120 
121             char *aligned = _handle;
122 
123             while ((size_t)aligned & (_SSE_ALIGNMENT - 1))
124                 aligned++;
125 
126             _buffer = (T *)aligned;
127         }
128 
129         T     *_buffer;
130 
131     private:
132 
133         char  *_handle;
134 };
135 
136 typedef SimdAlignedBuffer64<float>          SimdAlignedBuffer64f;
137 typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
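
//
// Usage sketch (illustrative only, not used by the compressor itself):
// the buffers provide 64 elements of scratch space for one 8x8 block,
// aligned strongly enough for both the SSE and AVX paths below.
//

inline void
exampleUseSimdAlignedBuffer64 ()
{
    SimdAlignedBuffer64f block;

    assert (((size_t)block._buffer & _AVX_ALIGNMENT_MASK) == 0);

    for (int i = 0; i < 64; ++i)
        block._buffer[i] = 0.0f;
}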
138 
139 namespace {
140 
141 //
142 // Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B'
143 //
144 
145 void
csc709Inverse (float &comp0, float &comp1, float &comp2)
147 {
148     float src[3];
149 
150     src[0] = comp0;
151     src[1] = comp1;
152     src[2] = comp2;
153 
154     comp0 = src[0]                    + 1.5747f * src[2];
155     comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2];
156     comp2 = src[0] + 1.8556f * src[1];
157 }
158 
159 #ifndef IMF_HAVE_SSE2
160 
161 
//
// Scalar color space conversion, based on 709 primary chromaticities.
// No scaling or offsets, just the matrix.
//
166 
167 void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
169 {
170     for (int i = 0; i < 64; ++i)
171         csc709Inverse (comp0[i], comp1[i], comp2[i]);
172 }
173 
174 #else /* IMF_HAVE_SSE2 */
175 
176 //
177 // SSE2 color space conversion
178 //
179 
180 void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
182 {
183     __m128 c0 = { 1.5747f,  1.5747f,  1.5747f,  1.5747f};
184     __m128 c1 = { 1.8556f,  1.8556f,  1.8556f,  1.8556f};
185     __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
186     __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f};
187 
188     __m128 *r = (__m128 *)comp0;
189     __m128 *g = (__m128 *)comp1;
190     __m128 *b = (__m128 *)comp2;
191     __m128 src[3];
192 
193     #define CSC_INVERSE_709_SSE2_LOOP(i)                       \
194             src[0] = r[i];                                     \
195             src[1] = g[i];                                     \
196             src[2] = b[i];                                     \
197                                                                \
198             r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \
199                                                                \
200             g[i]   = _mm_mul_ps (g[i], c2);                    \
201             src[2] = _mm_mul_ps (src[2], c3);                  \
202             g[i]   = _mm_add_ps (g[i], src[0]);                \
203             g[i]   = _mm_add_ps (g[i], src[2]);                \
204                                                                \
205             b[i] = _mm_mul_ps (c1,   src[1]);                  \
206             b[i] = _mm_add_ps (b[i], src[0]);
207 
208     CSC_INVERSE_709_SSE2_LOOP (0)
209     CSC_INVERSE_709_SSE2_LOOP (1)
210     CSC_INVERSE_709_SSE2_LOOP (2)
211     CSC_INVERSE_709_SSE2_LOOP (3)
212 
213     CSC_INVERSE_709_SSE2_LOOP (4)
214     CSC_INVERSE_709_SSE2_LOOP (5)
215     CSC_INVERSE_709_SSE2_LOOP (6)
216     CSC_INVERSE_709_SSE2_LOOP (7)
217 
218     CSC_INVERSE_709_SSE2_LOOP (8)
219     CSC_INVERSE_709_SSE2_LOOP (9)
220     CSC_INVERSE_709_SSE2_LOOP (10)
221     CSC_INVERSE_709_SSE2_LOOP (11)
222 
223     CSC_INVERSE_709_SSE2_LOOP (12)
224     CSC_INVERSE_709_SSE2_LOOP (13)
225     CSC_INVERSE_709_SSE2_LOOP (14)
226     CSC_INVERSE_709_SSE2_LOOP (15)
227 }
228 
229 #endif /* IMF_HAVE_SSE2 */
230 
231 
232 //
233 // Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
234 //
// Simple FPU color space conversion. Based on the 709
// primary chromaticities, with no scaling or offsets.
237 //
238 
239 void
csc709Forward64 (float *comp0, float *comp1, float *comp2)
241 {
242     float src[3];
243 
244     for (int i = 0; i<64; ++i)
245     {
246         src[0] = comp0[i];
247         src[1] = comp1[i];
248         src[2] = comp2[i];
249 
250         comp0[i] =  0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2];
251         comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2];
252         comp2[i] =  0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2];
253     }
254 }
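
//
// Round-trip sketch (illustrative, not part of the compressor): the
// forward and inverse matrices above are approximate inverses of each
// other, so running both should reproduce the input to within float
// rounding. The buffers must obey the alignment note at the top of
// this file so the SSE2 inverse path can load them directly.
//

void
csc709RoundTrip64 (float *comp0, float *comp1, float *comp2)
{
    csc709Forward64 (comp0, comp1, comp2);   // R'G'B' -> Y'CbCr
    csc709Inverse64 (comp0, comp1, comp2);   // Y'CbCr -> R'G'B' (approx.)
}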
255 
256 
257 //
258 // Byte interleaving of 2 byte arrays:
259 //    src0 = AAAA
260 //    src1 = BBBB
261 //    dst  = ABABABAB
262 //
263 // numBytes is the size of each of the source buffers
264 //
265 
266 #ifndef IMF_HAVE_SSE2
267 
268 //
269 // Scalar default implementation
270 //
271 
272 void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
274 {
275     for (int x = 0; x < numBytes; ++x)
276     {
277         dst[2 * x]     = src0[x];
278         dst[2 * x + 1] = src1[x];
279     }
280 }
281 
282 #else  /* IMF_HAVE_SSE2 */
283 
284 //
285 // SSE2 byte interleaving
286 //
287 
288 void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
290 {
291     int dstAlignment  = (size_t)dst  % 16;
292     int src0Alignment = (size_t)src0 % 16;
293     int src1Alignment = (size_t)src1 % 16;
294 
295     __m128i *dst_epi8  = (__m128i*)dst;
296     __m128i *src0_epi8 = (__m128i*)src0;
297     __m128i *src1_epi8 = (__m128i*)src1;
298     int sseWidth  =  numBytes / 16;
299 
300     if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment))
301     {
302         __m128i tmp0, tmp1;
303 
304         //
305         // Aligned loads and stores
306         //
307 
308         for (int x = 0; x < sseWidth; ++x)
309         {
310             tmp0 = src0_epi8[x];
311             tmp1 = src1_epi8[x];
312 
313             _mm_stream_si128 (&dst_epi8[2 * x],
314                               _mm_unpacklo_epi8 (tmp0, tmp1));
315 
316             _mm_stream_si128 (&dst_epi8[2 * x + 1],
317                               _mm_unpackhi_epi8 (tmp0, tmp1));
318         }
319 
        //
        // Then run the leftovers one at a time
        //
323 
324         for (int x = 16 * sseWidth; x < numBytes; ++x)
325         {
326             dst[2 * x]     = src0[x];
327             dst[2 * x + 1] = src1[x];
328         }
329     }
330     else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8))
331     {
332         //
333         // Aligned stores, but catch up a few values so we can
334         // use aligned loads
335         //
336 
337         for (int x = 0; x < 8; ++x)
338         {
339             dst[2 * x]     = src0[x];
340             dst[2 * x + 1] = src1[x];
341         }
342 
343         dst_epi8  = (__m128i*)&dst[16];
344         src0_epi8 = (__m128i*)&src0[8];
345         src1_epi8 = (__m128i*)&src1[8];
346         sseWidth  =  (numBytes - 8) / 16;
347 
348         for (int x=0; x<sseWidth; ++x)
349         {
350             _mm_stream_si128 (&dst_epi8[2 * x],
351                               _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));
352 
353             _mm_stream_si128 (&dst_epi8[2 * x + 1],
354                               _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
355         }
356 
        //
        // Then run the leftovers one at a time
        //
360 
361         for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
362         {
363             dst[2 * x]     = src0[x];
364             dst[2 * x + 1] = src1[x];
365         }
366     }
367     else
368     {
369         //
370         // Unaligned everything
371         //
372 
373         for (int x = 0; x < sseWidth; ++x)
374         {
375             __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]);
376             __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]);
377 
378             _mm_storeu_si128 (&dst_epi8[2 * x],
379                               _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
380 
381             _mm_storeu_si128 (&dst_epi8[2 * x + 1],
382                               _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
383         }
384 
        //
        // Then run the leftovers one at a time
        //
388 
389         for (int x = 16 * sseWidth; x < numBytes; ++x)
390         {
391             dst[2 * x]     = src0[x];
392             dst[2 * x + 1] = src1[x];
393         }
394     }
395 }
396 
397 #endif /* IMF_HAVE_SSE2 */
398 
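//
// Usage sketch (illustrative; the buffers below are invented for the
// example): interleaving two 16-byte planes produces the ABAB...
// pattern described above.
//

void
interleaveByte2Example ()
{
    char srcA[16], srcB[16], dst[32];

    for (int i = 0; i < 16; ++i)
    {
        srcA[i] = 'A';
        srcB[i] = 'B';
    }

    interleaveByte2 (dst, srcA, srcB, 16);

    // dst now holds "ABABAB..." (32 bytes, no terminator)
    assert (dst[0] == 'A' && dst[1] == 'B');
}
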
399 
400 //
401 // Float -> half float conversion
402 //
403 // To enable F16C based conversion, we can't rely on compile-time
404 // detection, hence the multiple defined versions. Pick one based
405 // on runtime cpuid detection.
406 //
407 
408 //
409 // Default boring conversion
410 //
411 
412 void
convertFloatToHalf64_scalar (unsigned short *dst, float *src)
414 {
415     for (int i=0; i<64; ++i)
416         dst[i] = ((half)src[i]).bits();
417 }
418 
419 
420 //
421 // F16C conversion - Assumes aligned src and dst
422 //
423 
424 void
convertFloatToHalf64_f16c (unsigned short *dst, float *src)
426 {
    //
    // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
    // However, in order to get the intrinsics, we need to tell
    // the compiler to generate VEX instructions.
    //
    // (On the GCC side, -mf16c goes ahead and activates -mavx,
    //  resulting in VEX code. Without -mf16c, no intrinsics.)
    //
    // Now, it's quite likely that we'll find ourselves in situations
    // where we want to build *without* VEX, in order to maintain
    // maximum compatibility. But to get there with intrinsics,
    // we'd need to break the code out into a separate file. Bleh.
    // I'll take the asm.
    //
441 
442     #if defined IMF_HAVE_GCC_INLINEASM
443         __asm__
444            ("vmovaps       (%0),     %%ymm0         \n"
445             "vmovaps   0x20(%0),     %%ymm1         \n"
446             "vmovaps   0x40(%0),     %%ymm2         \n"
447             "vmovaps   0x60(%0),     %%ymm3         \n"
448             "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
449             "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
450             "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
451             "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
452             "vmovdqa   %%xmm0,       0x00(%1)       \n"
453             "vmovdqa   %%xmm1,       0x10(%1)       \n"
454             "vmovdqa   %%xmm2,       0x20(%1)       \n"
455             "vmovdqa   %%xmm3,       0x30(%1)       \n"
456             "vmovaps   0x80(%0),     %%ymm0         \n"
457             "vmovaps   0xa0(%0),     %%ymm1         \n"
458             "vmovaps   0xc0(%0),     %%ymm2         \n"
459             "vmovaps   0xe0(%0),     %%ymm3         \n"
460             "vcvtps2ph $0,           %%ymm0, %%xmm0 \n"
461             "vcvtps2ph $0,           %%ymm1, %%xmm1 \n"
462             "vcvtps2ph $0,           %%ymm2, %%xmm2 \n"
463             "vcvtps2ph $0,           %%ymm3, %%xmm3 \n"
464             "vmovdqa   %%xmm0,       0x40(%1)       \n"
465             "vmovdqa   %%xmm1,       0x50(%1)       \n"
466             "vmovdqa   %%xmm2,       0x60(%1)       \n"
467             "vmovdqa   %%xmm3,       0x70(%1)       \n"
468         #ifndef __AVX__
469             "vzeroupper                             \n"
470         #endif /* __AVX__ */
471             : /* Output  */
472             : /* Input   */ "r"(src), "r"(dst)
473         #ifndef __AVX__
474             : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
475         #else
476             : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
477         #endif /* __AVX__ */
478            );
479     #else
480         convertFloatToHalf64_scalar (dst, src);
481     #endif /* IMF_HAVE_GCC_INLINEASM */
482 }
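
//
// Runtime dispatch sketch (illustrative; the function name and flag
// are made up for the example): as noted above, the choice between
// the two versions is made from CPUID at run time, roughly like this.
// The actual F16C query is performed elsewhere in the compressor.
//

void
convertFloatToHalf64_dispatch (unsigned short *dst,
                               float *src,
                               bool cpuHasF16C)
{
    if (cpuHasF16C)
        convertFloatToHalf64_f16c (dst, src);
    else
        convertFloatToHalf64_scalar (dst, src);
}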
483 
484 
485 //
486 // Convert an 8x8 block of HALF from zig-zag order to
487 // FLOAT in normal order. The order we want is:
488 //
489 //          src                           dst
490 //  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
491 //  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
492 // 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
493 // 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53
494 // 32 33 34 35 36 37 38 39      10 19 23 32 39 45 52 54
495 // 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
496 // 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
497 // 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
498 //
499 
500 void
fromHalfZigZag_scalar (unsigned short *src, float *dst)
502 {
503     half *srcHalf = (half *)src;
504 
505     dst[0] = (float)srcHalf[0];
506     dst[1] = (float)srcHalf[1];
507     dst[2] = (float)srcHalf[5];
508     dst[3] = (float)srcHalf[6];
509     dst[4] = (float)srcHalf[14];
510     dst[5] = (float)srcHalf[15];
511     dst[6] = (float)srcHalf[27];
512     dst[7] = (float)srcHalf[28];
513     dst[8] = (float)srcHalf[2];
514     dst[9] = (float)srcHalf[4];
515 
516     dst[10] = (float)srcHalf[7];
517     dst[11] = (float)srcHalf[13];
518     dst[12] = (float)srcHalf[16];
519     dst[13] = (float)srcHalf[26];
520     dst[14] = (float)srcHalf[29];
521     dst[15] = (float)srcHalf[42];
522     dst[16] = (float)srcHalf[3];
523     dst[17] = (float)srcHalf[8];
524     dst[18] = (float)srcHalf[12];
525     dst[19] = (float)srcHalf[17];
526 
527     dst[20] = (float)srcHalf[25];
528     dst[21] = (float)srcHalf[30];
529     dst[22] = (float)srcHalf[41];
530     dst[23] = (float)srcHalf[43];
531     dst[24] = (float)srcHalf[9];
532     dst[25] = (float)srcHalf[11];
533     dst[26] = (float)srcHalf[18];
534     dst[27] = (float)srcHalf[24];
535     dst[28] = (float)srcHalf[31];
536     dst[29] = (float)srcHalf[40];
537 
538     dst[30] = (float)srcHalf[44];
539     dst[31] = (float)srcHalf[53];
540     dst[32] = (float)srcHalf[10];
541     dst[33] = (float)srcHalf[19];
542     dst[34] = (float)srcHalf[23];
543     dst[35] = (float)srcHalf[32];
544     dst[36] = (float)srcHalf[39];
545     dst[37] = (float)srcHalf[45];
546     dst[38] = (float)srcHalf[52];
547     dst[39] = (float)srcHalf[54];
548 
549     dst[40] = (float)srcHalf[20];
550     dst[41] = (float)srcHalf[22];
551     dst[42] = (float)srcHalf[33];
552     dst[43] = (float)srcHalf[38];
553     dst[44] = (float)srcHalf[46];
554     dst[45] = (float)srcHalf[51];
555     dst[46] = (float)srcHalf[55];
556     dst[47] = (float)srcHalf[60];
557     dst[48] = (float)srcHalf[21];
558     dst[49] = (float)srcHalf[34];
559 
560     dst[50] = (float)srcHalf[37];
561     dst[51] = (float)srcHalf[47];
562     dst[52] = (float)srcHalf[50];
563     dst[53] = (float)srcHalf[56];
564     dst[54] = (float)srcHalf[59];
565     dst[55] = (float)srcHalf[61];
566     dst[56] = (float)srcHalf[35];
567     dst[57] = (float)srcHalf[36];
568     dst[58] = (float)srcHalf[48];
569     dst[59] = (float)srcHalf[49];
570 
571     dst[60] = (float)srcHalf[57];
572     dst[61] = (float)srcHalf[58];
573     dst[62] = (float)srcHalf[62];
574     dst[63] = (float)srcHalf[63];
575 }
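
//
// Equivalent table-driven form of the reordering above (illustrative
// only; the unrolled version is what the compressor actually uses).
// The table is just the "dst" matrix from the comment block before
// fromHalfZigZag_scalar(), read row by row.
//

void
fromHalfZigZag_table (unsigned short *src, float *dst)
{
    static const int remap[64] = {
         0,  1,  5,  6, 14, 15, 27, 28,
         2,  4,  7, 13, 16, 26, 29, 42,
         3,  8, 12, 17, 25, 30, 41, 43,
         9, 11, 18, 24, 31, 40, 44, 53,
        10, 19, 23, 32, 39, 45, 52, 54,
        20, 22, 33, 38, 46, 51, 55, 60,
        21, 34, 37, 47, 50, 56, 59, 61,
        35, 36, 48, 49, 57, 58, 62, 63
    };

    half *srcHalf = (half *)src;

    for (int i = 0; i < 64; ++i)
        dst[i] = (float)srcHalf[remap[i]];
}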
576 
577 
578 //
579 // If we can form the correct ordering in xmm registers,
580 // we can use F16C to convert from HALF -> FLOAT. However,
581 // making the correct order isn't trivial.
582 //
583 // We want to re-order a source 8x8 matrix from:
584 //
585 //  0  1  2  3  4  5  6  7       0  1  5  6 14 15 27 28
586 //  8  9 10 11 12 13 14 15       2  4  7 13 16 26 29 42
587 // 16 17 18 19 20 21 22 23       3  8 12 17 25 30 41 43
588 // 24 25 26 27 28 29 30 31       9 11 18 24 31 40 44 53   (A)
589 // 32 33 34 35 36 37 38 39  --> 10 19 23 32 39 45 52 54
590 // 40 41 42 43 44 45 46 47      20 22 33 38 46 51 55 60
591 // 48 49 50 51 52 53 54 55      21 34 37 47 50 56 59 61
592 // 56 57 58 59 60 61 62 63      35 36 48 49 57 58 62 63
593 //
594 // Which looks like a mess, right?
595 //
596 // Now, check out the NE/SW diagonals of (A). Along those lines,
597 // we have runs of contiguous values! If we rewrite (A) a bit, we get:
598 //
599 //  0
600 //  1  2
601 //  5  4  3
602 //  6  7  8  9
603 // 14 13 12 11 10
604 // 15 16 17 18 19 20
605 // 27 26 25 24 23 22 21            (B)
606 // 28 29 30 31 32 33 34 35
607 //    42 41 40 39 38 37 36
608 //       43 44 45 46 47 48
609 //          53 52 51 50 49
610 //             54 55 56 57
611 //                60 59 58
612 //                   61 62
613 //                      63
614 //
// In this ordering, the columns are the rows of (A). If we can 'transpose'
// (B), we'll achieve our goal. But we want this to fit nicely into
// xmm registers and still be able to load large runs efficiently.
// Also, notice that the odd rows are in ascending order, while
// the even rows are in descending order.
//
// If we 'fold' the bottom half up into the top, we can preserve ordered
// runs across rows, and still keep all the correct values in columns.
// After transposing, we'll need to rotate things back into place.
// This gives us:
//
//  0 | 42   41   40   39   38   37   36
//  1    2 | 43   44   45   46   47   48
//  5    4    3 | 53   52   51   50   49
//  6    7    8    9 | 54   55   56   57      (C)
// 14   13   12   11   10 | 60   59   58
// 15   16   17   18   19   20 | 61   62
// 27   26   25   24   23   22   21 | 63
// 28   29   30   31   32   33   34   35
//
// But hang on. We still have the backwards descending rows to deal with.
// Let's reverse the even rows so that all values are in ascending order:
//
//  36   37  38   39   40   41   42 | 0
//  1    2 | 43   44   45   46   47   48
//  49   50  51   52   53 |  3    4    5
//  6    7    8    9 | 54   55   56   57      (D)
// 58   59   60 | 10   11   12   13   14
// 15   16   17   18   19   20 | 61   62
// 63 | 21   22   23   24   25   26   27
// 28   29   30   31   32   33   34   35
646 //
647 // If we can form (D),  we will then:
648 //   1) Reverse the even rows
649 //   2) Transpose
650 //   3) Rotate the rows
651 //
652 // and we'll have (A).
653 //
654 
655 void
fromHalfZigZag_f16c (unsigned short *src, float *dst)
657 {
658     #if defined IMF_HAVE_GCC_INLINEASM_64
659         __asm__
660 
661            /* x3 <- 0
662             * x8 <- [ 0- 7]
663             * x6 <- [56-63]
664             * x9 <- [21-28]
665             * x7 <- [28-35]
666             * x3 <- [ 6- 9] (lower half) */
667 
668           ("vpxor   %%xmm3,  %%xmm3, %%xmm3   \n"
669            "vmovdqa    (%0), %%xmm8           \n"
670            "vmovdqa 112(%0), %%xmm6           \n"
671            "vmovdqu  42(%0), %%xmm9           \n"
672            "vmovdqu  56(%0), %%xmm7           \n"
673            "vmovq    12(%0), %%xmm3           \n"
674 
675            /* Setup rows 0-2 of A in xmm0-xmm2
676             * x1 <- x8 >> 16 (1 value)
677             * x2 <- x8 << 32 (2 values)
678             * x0 <- alignr([35-42], x8, 2)
679             * x1 <- blend(x1, [41-48])
680             * x2 <- blend(x2, [49-56])     */
681 
682            "vpsrldq      $2, %%xmm8, %%xmm1   \n"
683            "vpslldq      $4, %%xmm8, %%xmm2   \n"
684            "vpalignr     $2, 70(%0), %%xmm8, %%xmm0 \n"
685            "vpblendw  $0xfc, 82(%0), %%xmm1, %%xmm1 \n"
686            "vpblendw  $0x1f, 98(%0), %%xmm2, %%xmm2 \n"
687 
688            /* Setup rows 4-6 of A in xmm4-xmm6
689             * x4 <- x6 >> 32 (2 values)
690             * x5 <- x6 << 16 (1 value)
691             * x6 <- alignr(x6,x9,14)
692             * x4 <- blend(x4, [ 7-14])
693             * x5 <- blend(x5, [15-22])    */
694 
695            "vpsrldq      $4, %%xmm6, %%xmm4         \n"
696            "vpslldq      $2, %%xmm6, %%xmm5         \n"
697            "vpalignr    $14, %%xmm6, %%xmm9, %%xmm6 \n"
698            "vpblendw  $0xf8, 14(%0), %%xmm4, %%xmm4 \n"
699            "vpblendw  $0x3f, 30(%0), %%xmm5, %%xmm5 \n"
700 
701            /* Load the upper half of row 3 into xmm3
702             * x3 <- [54-57] (upper half) */
703 
704            "vpinsrq      $1, 108(%0), %%xmm3, %%xmm3\n"
705 
           /* Reverse the even rows. We're not using PSHUFB as
            * that requires loading an extra constant all the time,
            * and we're already pretty memory bound.
            */
710 
711            "vpshuflw $0x1b, %%xmm0, %%xmm0          \n"
712            "vpshuflw $0x1b, %%xmm2, %%xmm2          \n"
713            "vpshuflw $0x1b, %%xmm4, %%xmm4          \n"
714            "vpshuflw $0x1b, %%xmm6, %%xmm6          \n"
715 
716            "vpshufhw $0x1b, %%xmm0, %%xmm0          \n"
717            "vpshufhw $0x1b, %%xmm2, %%xmm2          \n"
718            "vpshufhw $0x1b, %%xmm4, %%xmm4          \n"
719            "vpshufhw $0x1b, %%xmm6, %%xmm6          \n"
720 
721            "vpshufd $0x4e, %%xmm0, %%xmm0          \n"
722            "vpshufd $0x4e, %%xmm2, %%xmm2          \n"
723            "vpshufd $0x4e, %%xmm4, %%xmm4          \n"
724            "vpshufd $0x4e, %%xmm6, %%xmm6          \n"
725 
726            /* Transpose xmm0-xmm7 into xmm8-xmm15 */
727 
728            "vpunpcklwd %%xmm1, %%xmm0, %%xmm8       \n"
729            "vpunpcklwd %%xmm3, %%xmm2, %%xmm9       \n"
730            "vpunpcklwd %%xmm5, %%xmm4, %%xmm10      \n"
731            "vpunpcklwd %%xmm7, %%xmm6, %%xmm11      \n"
732            "vpunpckhwd %%xmm1, %%xmm0, %%xmm12      \n"
733            "vpunpckhwd %%xmm3, %%xmm2, %%xmm13      \n"
734            "vpunpckhwd %%xmm5, %%xmm4, %%xmm14      \n"
735            "vpunpckhwd %%xmm7, %%xmm6, %%xmm15      \n"
736 
737            "vpunpckldq  %%xmm9,  %%xmm8, %%xmm0     \n"
738            "vpunpckldq %%xmm11, %%xmm10, %%xmm1     \n"
739            "vpunpckhdq  %%xmm9,  %%xmm8, %%xmm2     \n"
740            "vpunpckhdq %%xmm11, %%xmm10, %%xmm3     \n"
741            "vpunpckldq %%xmm13, %%xmm12, %%xmm4     \n"
742            "vpunpckldq %%xmm15, %%xmm14, %%xmm5     \n"
743            "vpunpckhdq %%xmm13, %%xmm12, %%xmm6     \n"
744            "vpunpckhdq %%xmm15, %%xmm14, %%xmm7     \n"
745 
746            "vpunpcklqdq %%xmm1,  %%xmm0, %%xmm8     \n"
747            "vpunpckhqdq %%xmm1,  %%xmm0, %%xmm9     \n"
748            "vpunpcklqdq %%xmm3,  %%xmm2, %%xmm10    \n"
749            "vpunpckhqdq %%xmm3,  %%xmm2, %%xmm11    \n"
750            "vpunpcklqdq %%xmm4,  %%xmm5, %%xmm12    \n"
751            "vpunpckhqdq %%xmm5,  %%xmm4, %%xmm13    \n"
752            "vpunpcklqdq %%xmm7,  %%xmm6, %%xmm14    \n"
753            "vpunpckhqdq %%xmm7,  %%xmm6, %%xmm15    \n"
754 
755            /* Rotate the rows to get the correct final order.
756             * Rotating xmm12 isn't needed, as we can handle
757             * the rotation in the PUNPCKLQDQ above. Rotating
758             * xmm8 isn't needed as it's already in the right order
759             */
760 
761            "vpalignr  $2,  %%xmm9,  %%xmm9,  %%xmm9 \n"
762            "vpalignr  $4, %%xmm10, %%xmm10, %%xmm10 \n"
763            "vpalignr  $6, %%xmm11, %%xmm11, %%xmm11 \n"
764            "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
765            "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
766            "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"
767 
768             /* Convert from half -> float */
769 
770            "vcvtph2ps  %%xmm8, %%ymm8            \n"
771            "vcvtph2ps  %%xmm9, %%ymm9            \n"
772            "vcvtph2ps %%xmm10, %%ymm10           \n"
773            "vcvtph2ps %%xmm11, %%ymm11           \n"
774            "vcvtph2ps %%xmm12, %%ymm12           \n"
775            "vcvtph2ps %%xmm13, %%ymm13           \n"
776            "vcvtph2ps %%xmm14, %%ymm14           \n"
777            "vcvtph2ps %%xmm15, %%ymm15           \n"
778 
779            /* Move float values to dst */
780 
781            "vmovaps    %%ymm8,    (%1)           \n"
782            "vmovaps    %%ymm9,  32(%1)           \n"
783            "vmovaps   %%ymm10,  64(%1)           \n"
784            "vmovaps   %%ymm11,  96(%1)           \n"
785            "vmovaps   %%ymm12, 128(%1)           \n"
786            "vmovaps   %%ymm13, 160(%1)           \n"
787            "vmovaps   %%ymm14, 192(%1)           \n"
788            "vmovaps   %%ymm15, 224(%1)           \n"
789         #ifndef __AVX__
790             "vzeroupper                          \n"
791         #endif /* __AVX__ */
792             : /* Output  */
793             : /* Input   */ "r"(src), "r"(dst)
794             : /* Clobber */ "memory",
795         #ifndef __AVX__
796                             "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
797                             "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
798                             "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
799                             "%xmm12", "%xmm13", "%xmm14", "%xmm15"
800         #else
801                             "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
802                             "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
803                             "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
804                             "%ymm12", "%ymm13", "%ymm14", "%ymm15"
805         #endif /* __AVX__ */
806         );
807 
808     #else
809         fromHalfZigZag_scalar(src, dst);
810     #endif /* defined IMF_HAVE_GCC_INLINEASM_64 */
811 }
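
//
// Cross-check sketch (illustrative, not used by the compressor): the
// inline-asm path above should produce the same values as the scalar
// path; src must obey the alignment note at the top of this file.
//

bool
fromHalfZigZagMatchesScalar (unsigned short *src)
{
    SimdAlignedBuffer64f viaF16c, viaScalar;

    fromHalfZigZag_f16c   (src, viaF16c._buffer);
    fromHalfZigZag_scalar (src, viaScalar._buffer);

    for (int i = 0; i < 64; ++i)
        if (viaF16c._buffer[i] != viaScalar._buffer[i])
            return false;

    return true;
}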
812 
813 
814 //
815 // Inverse 8x8 DCT, only inverting the DC. This assumes that
816 // all AC frequencies are 0.
817 //
818 
819 #ifndef IMF_HAVE_SSE2
820 
821 void
dctInverse8x8DcOnly (float *data)
823 {
824     float val = data[0] * 3.535536e-01f * 3.535536e-01f;
825 
826     for (int i = 0; i < 64; ++i)
827         data[i] = val;
828 }
829 
830 #else  /* IMF_HAVE_SSE2 */
831 
832 void
dctInverse8x8DcOnly (float *data)
834 {
835     __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);
836     __m128 *dst = (__m128 *)data;
837 
838     for (int i = 0; i < 16; ++i)
839         dst[i] = src;
840 }
841 
842 #endif /* IMF_HAVE_SSE2 */
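
//
// Note (illustrative): 3.535536e-01 is a truncated 1/(2*sqrt(2)), the
// DCT normalization constant 'a' used by the full transform below, so
// both versions above compute data[0] * a * a -- i.e. data[0] / 8, up
// to the rounding of the truncated constant -- and replicate it across
// all 64 samples.
//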
843 
844 
845 //
846 // Full 8x8 Inverse DCT:
847 //
848 // Simple inverse DCT on an 8x8 block, with scalar ops only.
849 //  Operates on data in-place.
850 //
// This is based on the iDCT formulation (y = frequency domain,
852 //                                       x = spatial domain)
853 //
854 //    [x0]    [        ][y0]    [        ][y1]
855 //    [x1] =  [  M1    ][y2]  + [  M2    ][y3]
856 //    [x2]    [        ][y4]    [        ][y5]
857 //    [x3]    [        ][y6]    [        ][y7]
858 //
859 //    [x7]    [        ][y0]    [        ][y1]
860 //    [x6] =  [  M1    ][y2]  - [  M2    ][y3]
861 //    [x5]    [        ][y4]    [        ][y5]
862 //    [x4]    [        ][y6]    [        ][y7]
863 //
864 // where M1:             M2:
865 //
866 //   [a  c  a   f]     [b  d  e  g]
867 //   [a  f -a  -c]     [d -g -b -e]
868 //   [a -f -a   c]     [e -b  g  d]
869 //   [a -c  a  -f]     [g -e  d -b]
870 //
// and the constants are as defined below.
872 //
873 // If you know how many of the lower rows are zero, that can
874 // be passed in to help speed things up. If you don't know,
875 // just set zeroedRows=0.
876 //
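//
// For reference, the numeric values of those constants (these are the
// literals that appear in the SSE2 path below):
//
//   a = cos(pi/4)/2   ~ 0.353553      e = cos(5pi/16)/2 ~ 0.277785
//   b = cos(pi/16)/2  ~ 0.490393      f = cos(3pi/8)/2  ~ 0.191342
//   c = cos(pi/8)/2   ~ 0.461940      g = cos(7pi/16)/2 ~ 0.097545
//   d = cos(3pi/16)/2 ~ 0.415735
//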
877 
878 //
879 // Default implementation
880 //
881 
882 template <int zeroedRows>
883 void
dctInverse8x8_scalar (float *data)
885 {
886     const float a = .5f * cosf (3.14159f / 4.0f);
887     const float b = .5f * cosf (3.14159f / 16.0f);
888     const float c = .5f * cosf (3.14159f / 8.0f);
889     const float d = .5f * cosf (3.f*3.14159f / 16.0f);
890     const float e = .5f * cosf (5.f*3.14159f / 16.0f);
891     const float f = .5f * cosf (3.f*3.14159f / 8.0f);
892     const float g = .5f * cosf (7.f*3.14159f / 16.0f);
893 
894     float alpha[4], beta[4], theta[4], gamma[4];
895 
896     float *rowPtr = NULL;
897 
898     //
899     // First pass - row wise.
900     //
901     // This looks less-compact than the description above in
902     // an attempt to fold together common sub-expressions.
903     //
904 
905     for (int row = 0; row < 8 - zeroedRows; ++row)
906     {
907         rowPtr = data + row * 8;
908 
909         alpha[0] = c * rowPtr[2];
910         alpha[1] = f * rowPtr[2];
911         alpha[2] = c * rowPtr[6];
912         alpha[3] = f * rowPtr[6];
913 
914         beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
915         beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
916         beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
917         beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];
918 
919         theta[0] = a * (rowPtr[0] + rowPtr[4]);
920         theta[3] = a * (rowPtr[0] - rowPtr[4]);
921 
922         theta[1] = alpha[0] + alpha[3];
923         theta[2] = alpha[1] - alpha[2];
924 
925 
926         gamma[0] = theta[0] + theta[1];
927         gamma[1] = theta[3] + theta[2];
928         gamma[2] = theta[3] - theta[2];
929         gamma[3] = theta[0] - theta[1];
930 
931 
932         rowPtr[0] = gamma[0] + beta[0];
933         rowPtr[1] = gamma[1] + beta[1];
934         rowPtr[2] = gamma[2] + beta[2];
935         rowPtr[3] = gamma[3] + beta[3];
936 
937         rowPtr[4] = gamma[3] - beta[3];
938         rowPtr[5] = gamma[2] - beta[2];
939         rowPtr[6] = gamma[1] - beta[1];
940         rowPtr[7] = gamma[0] - beta[0];
941     }
942 
943     //
944     // Second pass - column wise.
945     //
946 
947     for (int column = 0; column < 8; ++column)
948     {
949         alpha[0] = c * data[16+column];
950         alpha[1] = f * data[16+column];
951         alpha[2] = c * data[48+column];
952         alpha[3] = f * data[48+column];
953 
954         beta[0] = b * data[8+column]  + d * data[24+column] +
955                   e * data[40+column] + g * data[56+column];
956 
957         beta[1] = d * data[8+column]  - g * data[24+column] -
958                   b * data[40+column] - e * data[56+column];
959 
960         beta[2] = e * data[8+column]  - b * data[24+column] +
961                   g * data[40+column] + d * data[56+column];
962 
963         beta[3] = g * data[8+column]  - e * data[24+column] +
964                   d * data[40+column] - b * data[56+column];
965 
966         theta[0] = a * (data[column] + data[32+column]);
967         theta[3] = a * (data[column] - data[32+column]);
968 
969         theta[1] = alpha[0] + alpha[3];
970         theta[2] = alpha[1] - alpha[2];
971 
972         gamma[0] = theta[0] + theta[1];
973         gamma[1] = theta[3] + theta[2];
974         gamma[2] = theta[3] - theta[2];
975         gamma[3] = theta[0] - theta[1];
976 
977         data[     column] = gamma[0] + beta[0];
978         data[ 8 + column] = gamma[1] + beta[1];
979         data[16 + column] = gamma[2] + beta[2];
980         data[24 + column] = gamma[3] + beta[3];
981 
982         data[32 + column] = gamma[3] - beta[3];
983         data[40 + column] = gamma[2] - beta[2];
984         data[48 + column] = gamma[1] - beta[1];
985         data[56 + column] = gamma[0] - beta[0];
986     }
987 }
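
//
// Usage sketch (illustrative; the dispatch below is made up for the
// example): zeroedRows is a template parameter, so a caller picks an
// instantiation once it knows how many trailing rows of the block are
// zero. Instantiating with 0 is always correct, just slower.
//

void
dctInverse8x8_scalarExample (float *block, int numZeroedRows)
{
    switch (numZeroedRows)
    {
      case 7:
        dctInverse8x8_scalar<7> (block);
        break;

      default:
        dctInverse8x8_scalar<0> (block);
        break;
    }
}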
988 
989 
990 //
991 // SSE2 Implementation
992 //
993 
994 template <int zeroedRows>
995 void
dctInverse8x8_sse2 (float *data)
997 {
998     #ifdef IMF_HAVE_SSE2
999         __m128 a  = {3.535536e-01f,3.535536e-01f,3.535536e-01f,3.535536e-01f};
1000         __m128 b  = {4.903927e-01f,4.903927e-01f,4.903927e-01f,4.903927e-01f};
1001         __m128 c  = {4.619398e-01f,4.619398e-01f,4.619398e-01f,4.619398e-01f};
1002         __m128 d  = {4.157349e-01f,4.157349e-01f,4.157349e-01f,4.157349e-01f};
1003         __m128 e  = {2.777855e-01f,2.777855e-01f,2.777855e-01f,2.777855e-01f};
1004         __m128 f  = {1.913422e-01f,1.913422e-01f,1.913422e-01f,1.913422e-01f};
1005         __m128 g  = {9.754573e-02f,9.754573e-02f,9.754573e-02f,9.754573e-02f};
1006 
1007         __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
1008         __m128 c1 = {4.619398e-01f, 1.913422e-01f,-1.913422e-01f,-4.619398e-01f};
1009         __m128 c2 = {3.535536e-01f,-3.535536e-01f,-3.535536e-01f, 3.535536e-01f};
1010         __m128 c3 = {1.913422e-01f,-4.619398e-01f, 4.619398e-01f,-1.913422e-01f};
1011 
1012         __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f};
1013         __m128 c5 = {4.157349e-01f,-9.754573e-02f,-4.903927e-01f,-2.777855e-01f};
1014         __m128 c6 = {2.777855e-01f,-4.903927e-01f, 9.754573e-02f, 4.157349e-01f};
1015         __m128 c7 = {9.754573e-02f,-2.777855e-01f, 4.157349e-01f,-4.903927e-01f};
1016 
1017         __m128 *srcVec = (__m128 *)data;
1018         __m128 x[8], evenSum, oddSum;
1019         __m128 in[8], alpha[4], beta[4], theta[4], gamma[4];
1020 
1021         //
1022         // Rows -
1023         //
1024         //  Treat this just like matrix-vector multiplication. The
1025         //  trick is to note that:
1026         //
1027         //    [M00 M01 M02 M03][v0]   [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
1028         //    [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
1029         //    [M20 M21 M22 M23][v2]   [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
1030         //    [M30 M31 M32 M33][v3]   [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
1031         //
1032         // Then, we can fill a register with v_i and multiply by the i-th column
1033         // of M, accumulating across all i-s.
1034         //
        // The kids refer to populating a register with a single value as
        // "broadcasting", and it can be done with a shuffle instruction. It
        // seems to be the slowest part of the whole ordeal.
1038         //
1039         // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
1040         // c4-7 are from M2.
1041         //
1042 
1043         #define DCT_INVERSE_8x8_SS2_ROW_LOOP(i)                             \
1044             /*                                                              \
1045              * Broadcast the components of the row                          \
1046              */                                                             \
1047                                                                             \
1048             x[0] = _mm_shuffle_ps (srcVec[2 * i],                           \
1049                                    srcVec[2 * i],                           \
1050                                    _MM_SHUFFLE (0, 0, 0, 0));               \
1051                                                                             \
1052             x[1] = _mm_shuffle_ps (srcVec[2 * i],                           \
1053                                    srcVec[2 * i],                           \
1054                                    _MM_SHUFFLE (1, 1, 1, 1));               \
1055                                                                             \
1056             x[2] = _mm_shuffle_ps (srcVec[2 * i],                           \
1057                                    srcVec[2 * i],                           \
1058                                    _MM_SHUFFLE (2, 2, 2, 2));               \
1059                                                                             \
1060             x[3] = _mm_shuffle_ps (srcVec[2 * i],                           \
1061                                    srcVec[2 * i],                           \
1062                                    _MM_SHUFFLE (3, 3, 3, 3));               \
1063                                                                             \
1064             x[4] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1065                                    srcVec[2 * i + 1],                       \
1066                                    _MM_SHUFFLE (0, 0, 0, 0));               \
1067                                                                             \
1068             x[5] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1069                                    srcVec[2 * i + 1],                       \
1070                                    _MM_SHUFFLE (1, 1, 1, 1));               \
1071                                                                             \
1072             x[6] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1073                                    srcVec[2 * i + 1],                       \
1074                                    _MM_SHUFFLE (2, 2, 2, 2));               \
1075                                                                             \
1076             x[7] = _mm_shuffle_ps (srcVec[2 * i + 1],                       \
1077                                    srcVec[2 * i + 1],                       \
1078                                    _MM_SHUFFLE (3, 3, 3, 3));               \
1079             /*                                                              \
1080              * Multiply the components by each column of the matrix         \
1081              */                                                             \
1082                                                                             \
1083             x[0] = _mm_mul_ps (x[0], c0);                                   \
1084             x[2] = _mm_mul_ps (x[2], c1);                                   \
1085             x[4] = _mm_mul_ps (x[4], c2);                                   \
1086             x[6] = _mm_mul_ps (x[6], c3);                                   \
1087                                                                             \
1088             x[1] = _mm_mul_ps (x[1], c4);                                   \
1089             x[3] = _mm_mul_ps (x[3], c5);                                   \
1090             x[5] = _mm_mul_ps (x[5], c6);                                   \
1091             x[7] = _mm_mul_ps (x[7], c7);                                   \
1092                                                                             \
1093             /*                                                              \
1094              * Add across                                                   \
1095              */                                                             \
1096                                                                             \
1097             evenSum = _mm_setzero_ps();                                     \
1098             evenSum = _mm_add_ps (evenSum, x[0]);                           \
1099             evenSum = _mm_add_ps (evenSum, x[2]);                           \
1100             evenSum = _mm_add_ps (evenSum, x[4]);                           \
1101             evenSum = _mm_add_ps (evenSum, x[6]);                           \
1102                                                                             \
1103             oddSum = _mm_setzero_ps();                                      \
1104             oddSum = _mm_add_ps (oddSum, x[1]);                             \
1105             oddSum = _mm_add_ps (oddSum, x[3]);                             \
1106             oddSum = _mm_add_ps (oddSum, x[5]);                             \
1107             oddSum = _mm_add_ps (oddSum, x[7]);                             \
1108                                                                             \
1109             /*                                                              \
1110              * Final Sum:                                                   \
1111              *    out [0, 1, 2, 3] = evenSum + oddSum                       \
1112              *    out [7, 6, 5, 4] = evenSum - oddSum                       \
1113              */                                                             \
1114                                                                             \
1115             srcVec[2 * i]     = _mm_add_ps (evenSum, oddSum);               \
1116             srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum);               \
1117             srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1],          \
1118                                                 srcVec[2 * i + 1],          \
1119                                                 _MM_SHUFFLE (0, 1, 2, 3));
1120 
1121         switch (zeroedRows)
1122         {
1123           case 0:
1124           default:
1125             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1126             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1127             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1128             DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1129             DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1130             DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1131             DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1132             DCT_INVERSE_8x8_SS2_ROW_LOOP (7)
1133             break;
1134 
1135           case 1:
1136             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1137             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1138             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1139             DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1140             DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1141             DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1142             DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1143             break;
1144 
1145           case 2:
1146             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1147             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1148             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1149             DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1150             DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1151             DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1152             break;
1153 
1154           case 3:
1155             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1156             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1157             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1158             DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1159             DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1160             break;
1161 
1162           case 4:
1163             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1164             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1165             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1166             DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1167             break;
1168 
1169           case 5:
1170             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1171             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1172             DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1173             break;
1174 
1175           case 6:
1176             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1177             DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1178             break;
1179 
1180           case 7:
1181             DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1182             break;
1183         }
1184 
1185         //
1186         // Columns -
1187         //
1188         // This is slightly more straightforward, if less readable. Here
1189         // we just operate on 4 columns at a time, in two batches.
1190         //
1191         // The slight mess is to try and cache sub-expressions, which
1192         // we ignore in the row-wise pass.
1193         //
1194 
1195         for (int col = 0; col < 2; ++col)
1196         {
1197 
1198             for (int i = 0; i < 8; ++i)
1199                 in[i] = srcVec[2 * i + col];
1200 
1201             alpha[0] = _mm_mul_ps (c, in[2]);
1202             alpha[1] = _mm_mul_ps (f, in[2]);
1203             alpha[2] = _mm_mul_ps (c, in[6]);
1204             alpha[3] = _mm_mul_ps (f, in[6]);
1205 
1206             beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b),
1207                                                           _mm_mul_ps (in[3], d)),
1208                                               _mm_add_ps (_mm_mul_ps (in[5], e),
1209                                                           _mm_mul_ps (in[7], g)));
1210 
1211             beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d),
1212                                                           _mm_mul_ps (in[3], g)),
1213                                               _mm_add_ps (_mm_mul_ps (in[5], b),
1214                                                           _mm_mul_ps (in[7], e)));
1215 
1216             beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e),
1217                                                           _mm_mul_ps (in[3], b)),
1218                                               _mm_add_ps (_mm_mul_ps (in[5], g),
1219                                                           _mm_mul_ps (in[7], d)));
1220 
1221             beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g),
1222                                                           _mm_mul_ps (in[3], e)),
1223                                               _mm_sub_ps (_mm_mul_ps (in[5], d),
1224                                                           _mm_mul_ps (in[7], b)));
1225 
1226             theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
1227             theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));
1228 
1229             theta[1] = _mm_add_ps (alpha[0], alpha[3]);
1230             theta[2] = _mm_sub_ps (alpha[1], alpha[2]);
1231 
1232             gamma[0] = _mm_add_ps (theta[0], theta[1]);
1233             gamma[1] = _mm_add_ps (theta[3], theta[2]);
1234             gamma[2] = _mm_sub_ps (theta[3], theta[2]);
1235             gamma[3] = _mm_sub_ps (theta[0], theta[1]);
1236 
1237             srcVec[  col] = _mm_add_ps (gamma[0], beta[0]);
1238             srcVec[2+col] = _mm_add_ps (gamma[1], beta[1]);
1239             srcVec[4+col] = _mm_add_ps (gamma[2], beta[2]);
1240             srcVec[6+col] = _mm_add_ps (gamma[3], beta[3]);
1241 
1242             srcVec[ 8+col] = _mm_sub_ps (gamma[3], beta[3]);
1243             srcVec[10+col] = _mm_sub_ps (gamma[2], beta[2]);
1244             srcVec[12+col] = _mm_sub_ps (gamma[1], beta[1]);
1245             srcVec[14+col] = _mm_sub_ps (gamma[0], beta[0]);
1246         }
1247 
1248     #else /* IMF_HAVE_SSE2 */
1249 
1250         dctInverse8x8_scalar<zeroedRows> (data);
1251 
1252     #endif /* IMF_HAVE_SSE2 */
1253 }
1254 
1255 
1256 //
1257 // AVX Implementation
1258 //
1259 
1260 #define STR(A) #A
1261 
1262 #define IDCT_AVX_SETUP_2_ROWS(_DST0,  _DST1,  _TMP0,  _TMP1, \
1263                               _OFF00, _OFF01, _OFF10, _OFF11) \
1264     "vmovaps                 " STR(_OFF00) "(%0),  %%xmm" STR(_TMP0) "  \n" \
1265     "vmovaps                 " STR(_OFF01) "(%0),  %%xmm" STR(_TMP1) "  \n" \
1266     "                                                                                \n" \
1267     "vinsertf128  $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) "  \n" \
1268     "vinsertf128  $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) "  \n" \
1269     "                                                                                \n" \
1270     "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1271     "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n" \
1272     "                                                                                \n" \
1273     "vunpcklps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP0) "  \n" \
1274     "vunpckhps      %%ymm" STR(_DST1) ",  %%ymm" STR(_DST0) ",  %%ymm" STR(_TMP1) "  \n" \
1275     "                                                                                \n" \
1276     "vunpcklpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST0) "  \n" \
1277     "vunpckhpd      %%ymm" STR(_TMP1) ",  %%ymm" STR(_TMP0) ",  %%ymm" STR(_DST1) "  \n"
1278 
1279 #define IDCT_AVX_MMULT_ROWS(_SRC)                       \
1280     /* Broadcast the source values into y12-y15 */      \
1281     "vpermilps $0x00, " STR(_SRC) ", %%ymm12       \n"  \
1282     "vpermilps $0x55, " STR(_SRC) ", %%ymm13       \n"  \
1283     "vpermilps $0xaa, " STR(_SRC) ", %%ymm14       \n"  \
1284     "vpermilps $0xff, " STR(_SRC) ", %%ymm15       \n"  \
1285                                                         \
    /* Multiply the coefs by the broadcast values */        \
1287     "vmulps    %%ymm12,  %%ymm8, %%ymm12     \n"        \
1288     "vmulps    %%ymm13,  %%ymm9, %%ymm13     \n"        \
1289     "vmulps    %%ymm14, %%ymm10, %%ymm14     \n"        \
1290     "vmulps    %%ymm15, %%ymm11, %%ymm15     \n"        \
1291                                                         \
1292     /* Accumulate the result back into the source */    \
1293     "vaddps    %%ymm13, %%ymm12, %%ymm12      \n"       \
1294     "vaddps    %%ymm15, %%ymm14, %%ymm14      \n"       \
1295     "vaddps    %%ymm14, %%ymm12, " STR(_SRC) "\n"
1296 
1297 #define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK)      \
1298     "vsubps   " STR(_ODD) "," STR(_EVEN) "," STR(_BACK)  "\n"  \
1299     "vaddps   " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n"  \
1300     /* Reverse the back half                                */ \
1301     "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"
1302 
/* In order to allow for fast paths when we know certain rows
 * of the 8x8 block are zero, most of the body of the DCT is
 * in the following macro. Statements are wrapped in a ROWn()
 * macro, where n is the lowest row in the 8x8 block on which
 * they depend.
 *
 * This should work for the cases where we have 2-8 full rows.
 * The 1-row case is special, and we'll handle it separately.
 */
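/* Sketch of the idea only (illustrative; not the definitions the
 * compressor actually uses): a caller that knows rows 6 and 7 are
 * zero would define, e.g.,
 *
 *     #define ROW6(_X)
 *     #define ROW7(_X)
 *
 * so statements depending on those rows vanish, while ROW0()-ROW5()
 * expand to their argument unchanged.
 */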
1312 #define IDCT_AVX_BODY \
1313     /* ==============================================
1314      *               Row 1D DCT
1315      * ----------------------------------------------
1316      */                                                           \
1317                                                                   \
1318     /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds
1319      * the row-major 8x8 block, load ymm0-3 with the even columns
1320      * and ymm4-7 with the odd columns. The lower half of the ymm
1321      * holds one row, while the upper half holds the next row.
1322      *
1323      * If our source is:
1324      *    a0 a1 a2 a3   a4 a5 a6 a7
1325      *    b0 b1 b2 b3   b4 b5 b6 b7
1326      *
1327      * We'll be forming:
1328      *    a0 a2 a4 a6   b0 b2 b4 b6
1329      *    a1 a3 a5 a7   b1 b3 b5 b7
1330      */                                                              \
1331     ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15,    0,  16,  32,  48) ) \
1332     ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13,   64,  80,  96, 112) ) \
1333     ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11,  128, 144, 160, 176) ) \
1334     ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7,  8,  9,  192, 208, 224, 240) ) \
1335                                                                      \
    /* Multiply the even columns (ymm0-3) by the matrix M1
1337      * storing the results back in ymm0-3
1338      *
1339      * Assume that (%1) holds the matrix in column major order
1340      */                                                              \
1341     "vbroadcastf128   (%1),  %%ymm8         \n"                      \
1342     "vbroadcastf128 16(%1),  %%ymm9         \n"                      \
1343     "vbroadcastf128 32(%1), %%ymm10         \n"                      \
1344     "vbroadcastf128 48(%1), %%ymm11         \n"                      \
1345                                                                      \
1346     ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) )                              \
1347     ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) )                              \
1348     ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) )                              \
1349     ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) )                              \
1350                                                                      \
1351     /* Repeat, but with the odd columns (ymm4-7) and the
1352      * matrix M2
1353      */                                                              \
1354     "vbroadcastf128  64(%1),  %%ymm8         \n"                     \
1355     "vbroadcastf128  80(%1),  %%ymm9         \n"                     \
1356     "vbroadcastf128  96(%1), %%ymm10         \n"                     \
1357     "vbroadcastf128 112(%1), %%ymm11         \n"                     \
1358                                                                      \
1359     ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) )                              \
1360     ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) )                              \
1361     ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) )                              \
1362     ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) )                              \
1363                                                                      \
1364     /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the
1365      * front halves of the results, and take the difference to
1366      * get the back halves. The front halves end up in ymm0-3,
1367      * the back halves end up in ymm12-15.
1368      */                                                                \
1369     ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
1370     ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
1371     ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
1372     ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
1373                                                                        \
1374     /* Reassemble the row halves into ymm0-7  */                       \
1375     ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7   \n" )           \
1376     ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6   \n" )           \
1377     ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5   \n" )           \
1378     ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4   \n" )           \
1379     ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3   \n" )           \
1380     ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2   \n" )           \
1381     ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1   \n" )           \
1382     ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n" )           \
1383                                                                        \
1384                                                                        \
1385     /* ==============================================
1386      *                Column 1D DCT
1387      * ----------------------------------------------
1388      */                                                                \
1389                                                                        \
1390     /* Rows should be in ymm0-7, and M2 columns should still be
1391      * preserved in ymm8-11.  M2 has 4 unique values (and +-
1392      * versions of each), and all (positive) values appear in
1393      * the first column (and row), which is in ymm8.
1394      *
1395      * For the column-wise DCT, we need to:
1396      *   1) Broadcast each element of a row of M2 into 4 vectors
1397      *   2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts.
1398      *   3) Accumulate into ymm12-15 for the odd outputs.
1399      *
1400      * Instead of doing 16 broadcasts for each element in M2,
1401      * do 4, filling ymm8-11 with:
1402      *
1403      *     ymm8:  [ b  b  b  b  | b  b  b  b ]
1404      *     ymm9:  [ d  d  d  d  | d  d  d  d ]
1405      *     ymm10: [ e  e  e  e  | e  e  e  e ]
1406      *     ymm11: [ g  g  g  g  | g  g  g  g ]
1407      *
1408      * And deal with the negative values by subtracting during accum.
1409      */                                                                \
1410     "vpermilps        $0xff,  %%ymm8, %%ymm11  \n"                     \
1411     "vpermilps        $0xaa,  %%ymm8, %%ymm10  \n"                     \
1412     "vpermilps        $0x55,  %%ymm8, %%ymm9   \n"                     \
1413     "vpermilps        $0x00,  %%ymm8, %%ymm8   \n"                     \
1414                                                                        \
1415     /* This one is easy, since we have ymm12-15 open for scratch
1416      *    ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7
1417      */                                                                \
1418     ROW1( "vmulps    %%ymm1,  %%ymm8, %%ymm12    \n" )                 \
1419     ROW3( "vmulps    %%ymm3,  %%ymm9, %%ymm13    \n" )                 \
1420     ROW5( "vmulps    %%ymm5, %%ymm10, %%ymm14    \n" )                 \
1421     ROW7( "vmulps    %%ymm7, %%ymm11, %%ymm15    \n" )                 \
1422                                                                        \
1423     ROW3( "vaddps   %%ymm12, %%ymm13, %%ymm12    \n" )                 \
1424     ROW7( "vaddps   %%ymm14, %%ymm15, %%ymm14    \n" )                 \
1425     ROW5( "vaddps   %%ymm12, %%ymm14, %%ymm12    \n" )                 \
1426                                                                        \
1427     /* Trickier, since only ymm13-15 are open for scratch
1428      *    ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7
1429      */                                                                \
1430     ROW1( "vmulps    %%ymm1,   %%ymm9, %%ymm13   \n" )                 \
1431     ROW3( "vmulps    %%ymm3,  %%ymm11, %%ymm14   \n" )                 \
1432     ROW5( "vmulps    %%ymm5,   %%ymm8, %%ymm15   \n" )                 \
1433                                                                        \
1434     ROW5( "vaddps    %%ymm14, %%ymm15, %%ymm14   \n" )                 \
1435     ROW3( "vsubps    %%ymm14, %%ymm13, %%ymm13   \n" )                 \
1436                                                                        \
1437     ROW7( "vmulps    %%ymm7,  %%ymm10, %%ymm15   \n" )                 \
1438     ROW7( "vsubps    %%ymm15, %%ymm13, %%ymm13   \n" )                 \
1439                                                                        \
1440     /* Trickier still, as only ymm14-15 are open for scratch
1441      *    ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7
1442      */                                                                \
1443     ROW1( "vmulps     %%ymm1, %%ymm10,  %%ymm14  \n" )                 \
1444     ROW3( "vmulps     %%ymm3,  %%ymm8,  %%ymm15  \n" )                 \
1445                                                                        \
1446     ROW3( "vsubps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1447                                                                        \
1448     ROW5( "vmulps     %%ymm5, %%ymm11, %%ymm15   \n" )                 \
1449     ROW5( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1450                                                                        \
1451     ROW7( "vmulps    %%ymm7,   %%ymm9, %%ymm15   \n" )                 \
1452     ROW7( "vaddps    %%ymm15, %%ymm14, %%ymm14   \n" )                 \
1453                                                                        \
1454                                                                        \
1455     /* Easy, as we can blow away ymm1,3,5,7 for scratch
1456      *    ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7
1457      */                                                                \
1458     ROW1( "vmulps    %%ymm1, %%ymm11, %%ymm15    \n" )                 \
1459     ROW3( "vmulps    %%ymm3, %%ymm10,  %%ymm3    \n" )                 \
1460     ROW5( "vmulps    %%ymm5,  %%ymm9,  %%ymm5    \n" )                 \
1461     ROW7( "vmulps    %%ymm7,  %%ymm8,  %%ymm7    \n" )                 \
1462                                                                        \
1463     ROW5( "vaddps   %%ymm15,  %%ymm5, %%ymm15    \n" )                 \
1464     ROW7( "vaddps    %%ymm3,  %%ymm7,  %%ymm3    \n" )                 \
1465     ROW3( "vsubps    %%ymm3, %%ymm15, %%ymm15    \n" )                 \
1466                                                                        \
1467                                                                        \
1468     /* Load coefs for M1. Because we're going to broadcast
1469      * coefs, we don't need to load the actual structure from
1470      * M1. Instead, just load enough that we can broadcast.
1471      * There are only 6 unique values in M1, but they're in +-
1472      * pairs, leaving only 3 unique coefs if we add and subtract
1473      * properly.
1474      *
1475      * Fill      ymm1 with coef[2] = [ a  a  c  f | a  a  c  f ]
1476      * Broadcast ymm5 with           [ f  f  f  f | f  f  f  f ]
1477      * Broadcast ymm3 with           [ c  c  c  c | c  c  c  c ]
1478      * Broadcast ymm1 with           [ a  a  a  a | a  a  a  a ]
1479      */                                                                \
1480     "vbroadcastf128   8(%1),  %%ymm1          \n"                      \
1481     "vpermilps        $0xff,  %%ymm1, %%ymm5  \n"                      \
1482     "vpermilps        $0xaa,  %%ymm1, %%ymm3  \n"                      \
1483     "vpermilps        $0x00,  %%ymm1, %%ymm1  \n"                      \
1484                                                                        \
1485     /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following
1486      * common expressions:
1487      *
1488      *   E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
1489      *   E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1490      *
1491      *   E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1492      *   E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1493      *
1494      * Afterwards, ymm8-11 will hold the even outputs.
1495      */                                                                \
1496                                                                        \
1497     /*  ymm11 = (a ymm0 + a ymm4),   ymm1 = (a ymm0 - a ymm4) */       \
1498     ROW0( "vmulps    %%ymm1,  %%ymm0, %%ymm11   \n" )                  \
1499     ROW4( "vmulps    %%ymm1,  %%ymm4,  %%ymm4   \n" )                  \
1500     ROW0( "vmovaps   %%ymm11, %%ymm1            \n" )                  \
1501     ROW4( "vaddps    %%ymm4, %%ymm11, %%ymm11   \n" )                  \
1502     ROW4( "vsubps    %%ymm4,  %%ymm1,  %%ymm1   \n" )                  \
1503                                                                        \
1504     /* ymm7 = (c ymm2 + f ymm6) */                                     \
1505     ROW2( "vmulps    %%ymm3, %%ymm2,  %%ymm7    \n" )                  \
1506     ROW6( "vmulps    %%ymm5, %%ymm6,  %%ymm9    \n" )                  \
1507     ROW6( "vaddps    %%ymm9, %%ymm7,  %%ymm7    \n" )                  \
1508                                                                        \
1509     /* E_0 = ymm8  = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
1510      * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1511      */                                                                \
1512     ROW0( "vmovaps   %%ymm11, %%ymm8            \n" )                  \
1513     ROW2( "vaddps     %%ymm7, %%ymm8,  %%ymm8   \n" )                  \
1514     ROW2( "vsubps     %%ymm7, %%ymm11, %%ymm11  \n" )                  \
1515                                                                        \
1516     /* ymm7 = (f ymm2 - c ymm6) */                                     \
1517     ROW2( "vmulps     %%ymm5,  %%ymm2, %%ymm7   \n" )                  \
1518     ROW6( "vmulps     %%ymm3,  %%ymm6, %%ymm9   \n" )                  \
1519     ROW6( "vsubps     %%ymm9,  %%ymm7, %%ymm7   \n" )                  \
1520                                                                        \
1521     /* E_1 = ymm9  = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1522      * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1523      */                                                                \
1524     ROW0( "vmovaps   %%ymm1,  %%ymm9            \n" )                  \
1525     ROW0( "vmovaps   %%ymm1, %%ymm10            \n" )                  \
1526     ROW2( "vaddps    %%ymm7,  %%ymm1,  %%ymm9   \n" )                  \
1527     ROW2( "vsubps    %%ymm7,  %%ymm1,  %%ymm10  \n" )                  \
1528                                                                        \
1529     /* Add the evens (ymm8-11) and the odds (ymm12-15),
1530      * placing the results into ymm0-7
1531      */                                                                \
1532     "vaddps   %%ymm12,  %%ymm8, %%ymm0       \n"                       \
1533     "vaddps   %%ymm13,  %%ymm9, %%ymm1       \n"                       \
1534     "vaddps   %%ymm14, %%ymm10, %%ymm2       \n"                       \
1535     "vaddps   %%ymm15, %%ymm11, %%ymm3       \n"                       \
1536                                                                        \
1537     "vsubps   %%ymm12,  %%ymm8, %%ymm7       \n"                       \
1538     "vsubps   %%ymm13,  %%ymm9, %%ymm6       \n"                       \
1539     "vsubps   %%ymm14, %%ymm10, %%ymm5       \n"                       \
1540     "vsubps   %%ymm15, %%ymm11, %%ymm4       \n"                       \
1541                                                                        \
1542     /* Copy out the results from ymm0-7  */                            \
1543     "vmovaps   %%ymm0,    (%0)                   \n"                   \
1544     "vmovaps   %%ymm1,  32(%0)                   \n"                   \
1545     "vmovaps   %%ymm2,  64(%0)                   \n"                   \
1546     "vmovaps   %%ymm3,  96(%0)                   \n"                   \
1547     "vmovaps   %%ymm4, 128(%0)                   \n"                   \
1548     "vmovaps   %%ymm5, 160(%0)                   \n"                   \
1549     "vmovaps   %%ymm6, 192(%0)                   \n"                   \
1550     "vmovaps   %%ymm7, 224(%0)                   \n"
1551 
1552 /* Output, input, and clobber (OIC) sections of the inline asm */
1553 #define IDCT_AVX_OIC(_IN0)                          \
1554         : /* Output  */                            \
1555         : /* Input   */ "r"(_IN0), "r"(sAvxCoef)      \
1556         : /* Clobber */ "memory",                  \
1557                         "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3", \
1558                         "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7", \
1559                         "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",\
1560                         "%xmm12", "%xmm13", "%xmm14", "%xmm15"
1561 
1562 /* Include vzeroupper for non-AVX builds, to avoid AVX/SSE transition penalties */
1563 #ifndef __AVX__
1564     #define IDCT_AVX_ASM(_IN0)   \
1565         __asm__(                 \
1566             IDCT_AVX_BODY        \
1567             "vzeroupper      \n" \
1568             IDCT_AVX_OIC(_IN0)   \
1569         );
1570 #else /* __AVX__ */
1571     #define IDCT_AVX_ASM(_IN0)   \
1572         __asm__(                 \
1573             IDCT_AVX_BODY        \
1574             IDCT_AVX_OIC(_IN0)   \
1575         );
1576 #endif /* __AVX__ */
1577 
1578 template <int zeroedRows>
1579 void
1580 dctInverse8x8_avx (float *data)
1581 {
1582     #if defined IMF_HAVE_GCC_INLINEASM_64
1583 
1584     /* The column-major version of M1, followed by the
1585      * column-major version of M2:
1586      *
1587      *          [ a  c  a  f ]          [ b  d  e  g ]
1588      *   M1  =  [ a  f -a -c ]    M2 =  [ d -g -b -e ]
1589      *          [ a -f -a  c ]          [ e -b  g  d ]
1590      *          [ a -c  a -f ]          [ g -e  d -b ]
1591      */
1592     const float sAvxCoef[32]  __attribute__((aligned(32))) = {
1593         3.535536e-01,  3.535536e-01,  3.535536e-01,  3.535536e-01, /* a  a  a  a */
1594         4.619398e-01,  1.913422e-01, -1.913422e-01, -4.619398e-01, /* c  f -f -c */
1595         3.535536e-01, -3.535536e-01, -3.535536e-01,  3.535536e-01, /* a -a -a  a */
1596         1.913422e-01, -4.619398e-01,  4.619398e-01, -1.913422e-01, /* f -c  c -f */
1597 
1598         4.903927e-01,  4.157349e-01,  2.777855e-01,  9.754573e-02, /* b  d  e  g */
1599         4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1600         2.777855e-01, -4.903927e-01,  9.754573e-02,  4.157349e-01, /* e -b  g  d */
1601         9.754573e-02, -2.777855e-01,  4.157349e-01, -4.903927e-01  /* g -e  d -b */
1602     };
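
    /* For reference: these literals are cos(k*pi/16) scaled by 1/2, the
     * same cNHalf values used by the scalar and SSE2 paths elsewhere in
     * this file:
     *
     *    a = cos(4*pi/16)/2    b = cos(1*pi/16)/2    c = cos(2*pi/16)/2
     *    d = cos(3*pi/16)/2    e = cos(5*pi/16)/2    f = cos(6*pi/16)/2
     *    g = cos(7*pi/16)/2
     *
     * A sketch of how one entry could be regenerated (illustrative only;
     * the table is kept as literals so it stays constant-initialized):
     *
     *    coef = .5f * cosf (3.14159f * k / 16.0f);
     */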
1603 
1604         #define ROW0(_X) _X
1605         #define ROW1(_X) _X
1606         #define ROW2(_X) _X
1607         #define ROW3(_X) _X
1608         #define ROW4(_X) _X
1609         #define ROW5(_X) _X
1610         #define ROW6(_X) _X
1611         #define ROW7(_X) _X
1612 
1613         if (zeroedRows == 0) {
1614 
1615             IDCT_AVX_ASM(data)
1616 
1617         } else if (zeroedRows == 1) {
1618 
1619             #undef  ROW7
1620             #define ROW7(_X)
1621             IDCT_AVX_ASM(data)
1622 
1623         } else if (zeroedRows == 2) {
1624 
1625             #undef  ROW6
1626             #define ROW6(_X)
1627             IDCT_AVX_ASM(data)
1628 
1629         } else if (zeroedRows == 3) {
1630 
1631             #undef  ROW5
1632             #define ROW5(_X)
1633             IDCT_AVX_ASM(data)
1634 
1635         } else if (zeroedRows == 4) {
1636 
1637             #undef  ROW4
1638             #define ROW4(_X)
1639             IDCT_AVX_ASM(data)
1640 
1641         } else if (zeroedRows == 5) {
1642 
1643             #undef  ROW3
1644             #define ROW3(_X)
1645             IDCT_AVX_ASM(data)
1646 
1647         } else if (zeroedRows == 6) {
1648 
1649             #undef  ROW2
1650             #define ROW2(_X)
1651             IDCT_AVX_ASM(data)
1652 
1653         } else if (zeroedRows == 7) {
1654 
1655             __asm__(
1656 
1657                 /* ==============================================
1658                  *                Row 1D DCT
1659                  * ----------------------------------------------
1660                  */
1661                 IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15,    0,  16,  32,  48)
1662 
1663                 "vbroadcastf128   (%1),  %%ymm8         \n"
1664                 "vbroadcastf128 16(%1),  %%ymm9         \n"
1665                 "vbroadcastf128 32(%1), %%ymm10         \n"
1666                 "vbroadcastf128 48(%1), %%ymm11         \n"
1667 
1668                 /* Stash a vector of [a a a a | a a a a] away  in ymm2 */
1669                 "vinsertf128 $1,  %%xmm8,  %%ymm8,  %%ymm2 \n"
1670 
1671                 IDCT_AVX_MMULT_ROWS(%%ymm0)
1672 
1673                 "vbroadcastf128  64(%1),  %%ymm8         \n"
1674                 "vbroadcastf128  80(%1),  %%ymm9         \n"
1675                 "vbroadcastf128  96(%1), %%ymm10         \n"
1676                 "vbroadcastf128 112(%1), %%ymm11         \n"
1677 
1678                 IDCT_AVX_MMULT_ROWS(%%ymm4)
1679 
1680                 IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12)
1681 
1682                 "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0   \n"
1683 
1684                 /* ==============================================
1685                  *                Column 1D DCT
1686                  * ----------------------------------------------
1687                  */
1688 
1689                 /* DC only, so multiply by a and we're done */
1690                 "vmulps   %%ymm2, %%ymm0, %%ymm0  \n"
1691 
1692                 /* Copy out results  */
1693                 "vmovaps %%ymm0,    (%0)          \n"
1694                 "vmovaps %%ymm0,  32(%0)          \n"
1695                 "vmovaps %%ymm0,  64(%0)          \n"
1696                 "vmovaps %%ymm0,  96(%0)          \n"
1697                 "vmovaps %%ymm0, 128(%0)          \n"
1698                 "vmovaps %%ymm0, 160(%0)          \n"
1699                 "vmovaps %%ymm0, 192(%0)          \n"
1700                 "vmovaps %%ymm0, 224(%0)          \n"
1701 
1702                 #ifndef __AVX__
1703                     "vzeroupper                   \n"
1704                 #endif /* __AVX__ */
1705                 IDCT_AVX_OIC(data)
1706             );
1707         } else {
1708             assert(false); // Invalid template instance parameter
1709         }
1710     #else  /* IMF_HAVE_GCC_INLINEASM_64 */
1711 
1712         dctInverse8x8_scalar<zeroedRows>(data);
1713 
1714     #endif /*  IMF_HAVE_GCC_INLINEASM_64 */
1715 }
1716 
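//
// A hedged usage sketch (hypothetical helper, not part of the compressor's
// real call path): callers are expected to count the trailing all-zero rows
// of the 8x8 block and pick the matching template instantiation, along
// these lines:
//

inline void
dctInverse8x8_avx_dispatchSketch (float *data, int zeroedRows)
{
    switch (zeroedRows)
    {
      case 0: dctInverse8x8_avx<0> (data); break;
      case 1: dctInverse8x8_avx<1> (data); break;
      case 2: dctInverse8x8_avx<2> (data); break;
      case 3: dctInverse8x8_avx<3> (data); break;
      case 4: dctInverse8x8_avx<4> (data); break;
      case 5: dctInverse8x8_avx<5> (data); break;
      case 6: dctInverse8x8_avx<6> (data); break;
      case 7: dctInverse8x8_avx<7> (data); break;
      default: assert(false); // zeroedRows must be in [0,7]
    }
}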
1717 
1718 //
1719 // Full 8x8 Forward DCT:
1720 //
1721 // Base forward 8x8 DCT implementation. Works on the data in-place
1722 //
1723 // The implementation is described in Pennebaker + Mitchell,
1724 //  section 4.3.2, and illustrated in figure 4-7.
1725 //
1726 // The basic idea is that the 1D DCT math reduces to:
1727 //
1728 //   2*out_0            = c_4 [(s_07 + s_34) + (s_12 + s_56)]
1729 //   2*out_4            = c_4 [(s_07 + s_34) - (s_12 + s_56)]
1730 //
1731 //   {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
1732 //
1733 //   {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
1734 //                                d_34 - c_4 (d_12 + d_56))
1735 //
1736 //   {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
1737 //                               -d_34 - c_4 (d_12 + d_56))
1738 //
1739 // where:
1740 //
1741 //    c_i  = cos(i*pi/16)
1742 //    s_i  = sin(i*pi/16)
1743 //
1744 //    s_ij = in_i + in_j
1745 //    d_ij = in_i - in_j
1746 //
1747 //    rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y}
1748 //
1749 // We'll run the DCT in two passes. First, run the 1D DCT on
1750 // the rows, in-place. Then, run over the columns in-place,
1751 // and be done with it.
1752 //
1753 
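//
// For reference, a minimal sketch of the rotation defined above; the
// implementations below inline this arithmetic, folding the .5 output
// scale (and the sign of i) into their constants rather than calling a
// helper like this:
//

inline void
dctRotateSketch (float c, float s, float x, float y, float &outX, float &outY)
{
    outX =  c * x + s * y;    //  c_i*x + s_i*y
    outY = -s * x + c * y;    // -s_i*x + c_i*y
}
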
1754 #ifndef IMF_HAVE_SSE2
1755 
1756 //
1757 // Default implementation
1758 //
1759 
1760 void
1761 dctForward8x8 (float *data)
1762 {
1763     float A0, A1, A2, A3, A4, A5, A6, A7;
1764     float K0, K1, rot_x, rot_y;
1765 
1766     float *srcPtr = data;
1767     float *dstPtr = data;
1768 
1769     const float c1 = cosf (3.14159f * 1.0f / 16.0f);
1770     const float c2 = cosf (3.14159f * 2.0f / 16.0f);
1771     const float c3 = cosf (3.14159f * 3.0f / 16.0f);
1772     const float c4 = cosf (3.14159f * 4.0f / 16.0f);
1773     const float c5 = cosf (3.14159f * 5.0f / 16.0f);
1774     const float c6 = cosf (3.14159f * 6.0f / 16.0f);
1775     const float c7 = cosf (3.14159f * 7.0f / 16.0f);
1776 
1777     const float c1Half = .5f * c1;
1778     const float c2Half = .5f * c2;
1779     const float c3Half = .5f * c3;
1780     const float c5Half = .5f * c5;
1781     const float c6Half = .5f * c6;
1782     const float c7Half = .5f * c7;
1783 
1784     //
1785     // First pass - do a 1D DCT over the rows and write the
1786     //              results back in place
1787     //
1788 
1789     for (int row=0; row<8; ++row)
1790     {
1791         float *srcRowPtr = srcPtr + 8 * row;
1792         float *dstRowPtr = dstPtr + 8 * row;
1793 
1794         A0 = srcRowPtr[0] + srcRowPtr[7];
1795         A1 = srcRowPtr[1] + srcRowPtr[2];
1796         A2 = srcRowPtr[1] - srcRowPtr[2];
1797         A3 = srcRowPtr[3] + srcRowPtr[4];
1798         A4 = srcRowPtr[3] - srcRowPtr[4];
1799         A5 = srcRowPtr[5] + srcRowPtr[6];
1800         A6 = srcRowPtr[5] - srcRowPtr[6];
1801         A7 = srcRowPtr[0] - srcRowPtr[7];
1802 
1803         K0 = c4 * (A0 + A3);
1804         K1 = c4 * (A1 + A5);
1805 
1806         dstRowPtr[0] = .5f * (K0 + K1);
1807         dstRowPtr[4] = .5f * (K0 - K1);
1808 
1809         //
1810         // (2*dst2, 2*dst6) = rot 6 (d12 - d56,  s07 - s34)
1811         //
1812 
1813         rot_x = A2 - A6;
1814         rot_y = A0 - A3;
1815 
1816         dstRowPtr[2] =  c6Half * rot_x + c2Half * rot_y;
1817         dstRowPtr[6] =  c6Half * rot_y - c2Half * rot_x;
1818 
1819         //
1820         // K0 and K1 stay live until dst[1], dst[7],
1821         //  dst[3], and dst[5] have all been computed.
1822         //
1823 
1824         K0 = c4 * (A1 - A5);
1825         K1 = -1 * c4 * (A2 + A6);
1826 
1827         //
1828         // Two ways to do a rotation:
1829         //
1830         //  rot i (x, y) =
1831         //           X =  c_i*x + s_i*y
1832         //           Y = -s_i*x + c_i*y
1833         //
1834         //        OR
1835         //
1836         //           X = c_i*(x+y) + (s_i-c_i)*y
1837         //           Y = c_i*y     - (s_i+c_i)*x
1838         //
1839         // the first case has 4 multiplies, but fewer constants,
1840         // while the 2nd case has fewer multiplies but takes more space.
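        //
        // This implementation uses the first form; the sine coefficients
        // appear below as the complementary cosine constants (for example,
        // sin(3*pi/16) shows up as c5Half, with the .5 factor folded in).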
1841 
1842         //
1843         // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1844         //
1845 
1846         rot_x = A7 - K0;
1847         rot_y = A4 + K1;
1848 
1849         dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y;
1850         dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y;
1851 
1852         //
1853         // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1854         //
1855 
1856         rot_x = A7 + K0;
1857         rot_y = K1 - A4;
1858 
1859         //
1860         // A4 and A7 are no longer needed; all of the A values are now dead.
1861         //
1862 
1863         dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y;
1864         dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y;
1865     }
1866 
1867     //
1868     // Second pass - do the same, but on the columns
1869     //
1870 
1871     for (int column = 0; column < 8; ++column)
1872     {
1873 
1874         A0 = srcPtr[     column] + srcPtr[56 + column];
1875         A7 = srcPtr[     column] - srcPtr[56 + column];
1876 
1877         A1 = srcPtr[ 8 + column] + srcPtr[16 + column];
1878         A2 = srcPtr[ 8 + column] - srcPtr[16 + column];
1879 
1880         A3 = srcPtr[24 + column] + srcPtr[32 + column];
1881         A4 = srcPtr[24 + column] - srcPtr[32 + column];
1882 
1883         A5 = srcPtr[40 + column] + srcPtr[48 + column];
1884         A6 = srcPtr[40 + column] - srcPtr[48 + column];
1885 
1886         K0 = c4 * (A0 + A3);
1887         K1 = c4 * (A1 + A5);
1888 
1889         dstPtr[   column] = .5f * (K0 + K1);
1890         dstPtr[32+column] = .5f * (K0 - K1);
1891 
1892         //
1893         // (2*dst2, 2*dst6) = rot 6 ( d12 - d56,  s07 - s34 )
1894         //
1895 
1896         rot_x = A2 - A6;
1897         rot_y = A0 - A3;
1898 
1899         dstPtr[16+column] = .5f * (c6 * rot_x + c2 * rot_y);
1900         dstPtr[48+column] = .5f * (c6 * rot_y - c2 * rot_x);
1901 
1902         //
1903         // K0 and K1 stay live until dst[1], dst[7],
1904         //  dst[3], and dst[5] have all been computed.
1905         //
1906 
1907         K0 = c4 * (A1 - A5);
1908         K1 = -1 * c4 * (A2 + A6);
1909 
1910         //
1911         // (2*dst3, 2*dst5) = rot -3 ( d07 - K0,  d34 + K1 )
1912         //
1913 
1914         rot_x = A7 - K0;
1915         rot_y = A4 + K1;
1916 
1917         dstPtr[24+column] = .5f * (c3 * rot_x - c5 * rot_y);
1918         dstPtr[40+column] = .5f * (c5 * rot_x + c3 * rot_y);
1919 
1920         //
1921         // (2*dst1, 2*dst7) = rot -1 ( d07 + K0,  K1  - d34 )
1922         //
1923 
1924         rot_x = A7 + K0;
1925         rot_y = K1 - A4;
1926 
1927         dstPtr[ 8+column] = .5f * (c1 * rot_x - c7 * rot_y);
1928         dstPtr[56+column] = .5f * (c7 * rot_x + c1 * rot_y);
1929     }
1930 }
1931 
1932 #else  /* IMF_HAVE_SSE2 */
1933 
1934 //
1935 // SSE2 implementation
1936 //
1937 // Here, we always operate column-wise and rely on transposes.
1938 // It might be faster to handle the row pass and the column pass
1939 // differently, rather than reusing the same column-wise code.
1940 //
1941 
1942 void
1943 dctForward8x8 (float *data)
1944 {
1945     __m128 *srcVec = (__m128 *)data;
1946     __m128  a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
1947     __m128  k0Vec, k1Vec, rotXVec, rotYVec;
1948     __m128  transTmp[4], transTmp2[4];
1949 
1950     __m128  c4Vec     = { .70710678f,  .70710678f,  .70710678f,  .70710678f};
1951     __m128  c4NegVec  = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};
1952 
1953     __m128  c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f};
1954     __m128  c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
1955     __m128  c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f};
1956     __m128  c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f};
1957     __m128  c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
1958     __m128  c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f};
1959 
1960     __m128  halfVec   = {.5f, .5f, .5f, .5f};
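
    //
    // Note: the cNHalfVec constants above are .5f * cos (N*pi/16), the same
    // cNHalf values the scalar implementation computes at run time, written
    // out here as literals so they can be splatted across all four lanes.
    //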
1961 
1962     for (int iter = 0; iter < 2; ++iter)
1963     {
1964         //
1965         //  Operate on 4 columns at a time. The
1966         //    offsets into our row-major array are:
1967         //                  0:  0      1
1968         //                  1:  2      3
1969         //                  2:  4      5
1970         //                  3:  6      7
1971         //                  4:  8      9
1972         //                  5: 10     11
1973         //                  6: 12     13
1974         //                  7: 14     15
1975         //
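        // In other words, row r of the 8x8 block occupies srcVec[2*r] (its
        // left four columns) and srcVec[2*r + 1] (its right four columns);
        // 'pass' in the loop below selects the left or the right half.
        //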
1976 
1977         for (int pass=0; pass<2; ++pass)
1978         {
1979             a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
1980             a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
1981             a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
1982             a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);
1983 
1984             a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
1985             a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
1986             a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
1987             a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);
1988 
1989             //
1990             // First stage; Compute out_0 and out_4
1991             //
1992 
1993             k0Vec = _mm_add_ps (a0Vec, a3Vec);
1994             k1Vec = _mm_add_ps (a1Vec, a5Vec);
1995 
1996             k0Vec = _mm_mul_ps (c4Vec, k0Vec);
1997             k1Vec = _mm_mul_ps (c4Vec, k1Vec);
1998 
1999             srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
2000             srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);
2001 
2002             srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec );
2003             srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec );
2004 
2005 
2006             //
2007             // Second stage; Compute out_2 and out_6
2008             //
2009 
2010             k0Vec = _mm_sub_ps (a2Vec, a6Vec);
2011             k1Vec = _mm_sub_ps (a0Vec, a3Vec);
2012 
2013             srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec),
2014                                             _mm_mul_ps (c2HalfVec, k1Vec));
2015 
2016             srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec),
2017                                             _mm_mul_ps (c2HalfVec, k0Vec));
2018 
2019             //
2020             // Precompute K0 and K1 for the remaining stages
2021             //
2022 
2023             k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
2024             k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec);
2025 
2026             //
2027             // Third Stage, compute out_3 and out_5
2028             //
2029 
2030             rotXVec = _mm_sub_ps (a7Vec, k0Vec);
2031             rotYVec = _mm_add_ps (a4Vec, k1Vec);
2032 
2033             srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec),
2034                                             _mm_mul_ps (c5HalfVec, rotYVec));
2035 
2036             srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec),
2037                                             _mm_mul_ps (c3HalfVec, rotYVec));
2038 
2039             //
2040             // Fourth Stage, compute out_1 and out_7
2041             //
2042 
2043             rotXVec = _mm_add_ps (a7Vec, k0Vec);
2044             rotYVec = _mm_sub_ps (k1Vec, a4Vec);
2045 
2046             srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec),
2047                                             _mm_mul_ps (c7HalfVec, rotYVec));
2048 
2049             srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec),
2050                                             _mm_mul_ps (c1HalfVec, rotYVec));
2051         }
2052 
2053         //
2054         // Transpose the matrix in 4x4 blocks. So, if we have our
2055         // 8x8 matrix divided into 4x4 blocks:
2056         //
2057         //         M0 | M1         M0t | M2t
2058         //        ----+---   -->  -----+------
2059         //         M2 | M3         M1t | M3t
2060         //
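        // Each 4x4 block is transposed with the standard SSE shuffle idiom
        // (the same pattern _MM_TRANSPOSE4_PS uses): 0x44/0xEE gather the
        // low and high halves of two rows, and 0x88/0xDD then pick the even
        // and odd elements of those pairs to form the transposed rows.
        //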
2061 
2062         //
2063         // M0t, done in place, the first half.
2064         //
2065 
2066         transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
2067         transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
2068         transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
2069         transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);
2070 
2071         //
2072         // M3t, also done in place, the first half.
2073         //
2074 
2075         transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44);
2076         transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
2077         transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE);
2078         transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);
2079 
2080         //
2081         // M0t, the second half.
2082         //
2083 
2084         srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2085         srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2086         srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2087         srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2088 
2089         //
2090         // M3t, the second half.
2091         //
2092 
2093         srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2094         srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2095         srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2096         srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2097 
2098         //
2099         // M1 and M2 need to be done at the same time, because we're
2100         //  swapping.
2101         //
2102         // First, the first half of M1t
2103         //
2104 
2105         transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
2106         transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
2107         transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
2108         transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);
2109 
2110         //
2111         // And the first half of M2t
2112         //
2113 
2114         transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44);
2115         transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
2116         transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE);
2117         transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);
2118 
2119         //
2120         // Second half of M1t
2121         //
2122 
2123         srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
2124         srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
2125         srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
2126         srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);
2127 
2128         //
2129         // Second half of M2t
2130         //
2131 
2132         srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
2133         srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
2134         srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
2135         srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
2136     }
2137 }
2138 
2139 #endif /* IMF_HAVE_SSE2 */
2140 
2141 } // anonymous namespace
2142 
2143 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
2144 
2145 #endif
2146