1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // * Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33
34 #ifndef IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
35 #define IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED
36
37 //
// Various SSE accelerated functions, used by Imf::DwaCompressor.
// They live in a separate .h file because the fast paths are
// implemented with template specialization.
//
// Unless otherwise noted, all pointers are assumed to be 32-byte
// aligned. Unaligned pointers risk segfaults.
44 //
45
46 #include "ImfNamespace.h"
47 #include "ImfSimd.h"
48 #include "ImfSystemSpecific.h"
49 #include "OpenEXRConfig.h"
50
51 #include <half.h>
52 #include <assert.h>
53
54 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
55
56 #define _SSE_ALIGNMENT 32
57 #define _SSE_ALIGNMENT_MASK 0x0F
58 #define _AVX_ALIGNMENT_MASK 0x1F
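
//
// The masks are (alignment - 1): AND-ing a pointer with _SSE_ALIGNMENT_MASK
// or _AVX_ALIGNMENT_MASK yields zero only when the pointer is 16- or
// 32-byte aligned, respectively.
//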
59
60 //
61 // Test if we should enable GCC inline asm paths for AVX
62 //
63
64 #ifdef OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX
65
66 #define IMF_HAVE_GCC_INLINEASM
67
68 #ifdef __LP64__
69 #define IMF_HAVE_GCC_INLINEASM_64
70 #endif /* __LP64__ */
71
72 #endif /* OPENEXR_IMF_HAVE_GCC_INLINE_ASM_AVX */
73
74 //
75 // A simple 64-element array, aligned properly for SIMD access.
76 //
77
78 template <class T>
79 class SimdAlignedBuffer64
80 {
81 public:
82
    SimdAlignedBuffer64(): _buffer (0), _handle (0)
84 {
85 alloc();
86 }
87
    SimdAlignedBuffer64(const SimdAlignedBuffer64 &rhs): _handle(0)
89 {
90 alloc();
91 memcpy (_buffer, rhs._buffer, 64 * sizeof (T));
92 }
93
    ~SimdAlignedBuffer64 ()
95 {
96 EXRFreeAligned (_handle);
97 _handle = 0;
98 _buffer = 0;
99 }
100
    void alloc()
102 {
103 //
        // Try EXRAllocAligned first - but it might fall back to
105 // unaligned allocs. If so, overalloc.
106 //
107
108 _handle = (char *) EXRAllocAligned
109 (64 * sizeof(T), _SSE_ALIGNMENT);
110
111 if (((size_t)_handle & (_SSE_ALIGNMENT - 1)) == 0)
112 {
113 _buffer = (T *)_handle;
114 return;
115 }
116
117 EXRFreeAligned(_handle);
118 _handle = (char *) EXRAllocAligned
119 (64 * sizeof(T) + _SSE_ALIGNMENT, _SSE_ALIGNMENT);
120
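        //
        // Walk forward to the next properly aligned address within the
        // padded allocation.
        //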
121 char *aligned = _handle;
122
123 while ((size_t)aligned & (_SSE_ALIGNMENT - 1))
124 aligned++;
125
126 _buffer = (T *)aligned;
127 }
128
129 T *_buffer;
130
131 private:
132
133 char *_handle;
134 };
135
136 typedef SimdAlignedBuffer64<float> SimdAlignedBuffer64f;
137 typedef SimdAlignedBuffer64<unsigned short> SimdAlignedBuffer64us;
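
//
// Illustrative usage (not part of the original code): these buffers hold one
// 8x8 block so the SIMD paths below can assume aligned loads, e.g.:
//
//     SimdAlignedBuffer64f block;
//     // ... fill block._buffer[0..63] with DCT coefficients ...
//     dctInverse8x8_scalar<0> (block._buffer);
//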
138
139 namespace {
140
141 //
142 // Color space conversion, Inverse 709 CSC, Y'CbCr -> R'G'B'
143 //
144
145 void
csc709Inverse (float &comp0, float &comp1, float &comp2)
147 {
148 float src[3];
149
150 src[0] = comp0;
151 src[1] = comp1;
152 src[2] = comp2;
153
154 comp0 = src[0] + 1.5747f * src[2];
155 comp1 = src[0] - 0.1873f * src[1] - 0.4682f * src[2];
156 comp2 = src[0] + 1.8556f * src[1];
157 }
158
159 #ifndef IMF_HAVE_SSE2
160
161
162 //
// Scalar color space conversion, based on the 709 primary
// chromaticities. No scaling or offsets, just the matrix.
165 //
166
167 void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
169 {
170 for (int i = 0; i < 64; ++i)
171 csc709Inverse (comp0[i], comp1[i], comp2[i]);
172 }
173
174 #else /* IMF_HAVE_SSE2 */
175
176 //
177 // SSE2 color space conversion
178 //
179
180 void
csc709Inverse64 (float *comp0, float *comp1, float *comp2)
182 {
183 __m128 c0 = { 1.5747f, 1.5747f, 1.5747f, 1.5747f};
184 __m128 c1 = { 1.8556f, 1.8556f, 1.8556f, 1.8556f};
185 __m128 c2 = {-0.1873f, -0.1873f, -0.1873f, -0.1873f};
186 __m128 c3 = {-0.4682f, -0.4682f, -0.4682f, -0.4682f};
187
188 __m128 *r = (__m128 *)comp0;
189 __m128 *g = (__m128 *)comp1;
190 __m128 *b = (__m128 *)comp2;
191 __m128 src[3];
192
193 #define CSC_INVERSE_709_SSE2_LOOP(i) \
194 src[0] = r[i]; \
195 src[1] = g[i]; \
196 src[2] = b[i]; \
197 \
198 r[i] = _mm_add_ps (r[i], _mm_mul_ps (src[2], c0)); \
199 \
200 g[i] = _mm_mul_ps (g[i], c2); \
201 src[2] = _mm_mul_ps (src[2], c3); \
202 g[i] = _mm_add_ps (g[i], src[0]); \
203 g[i] = _mm_add_ps (g[i], src[2]); \
204 \
205 b[i] = _mm_mul_ps (c1, src[1]); \
206 b[i] = _mm_add_ps (b[i], src[0]);
207
208 CSC_INVERSE_709_SSE2_LOOP (0)
209 CSC_INVERSE_709_SSE2_LOOP (1)
210 CSC_INVERSE_709_SSE2_LOOP (2)
211 CSC_INVERSE_709_SSE2_LOOP (3)
212
213 CSC_INVERSE_709_SSE2_LOOP (4)
214 CSC_INVERSE_709_SSE2_LOOP (5)
215 CSC_INVERSE_709_SSE2_LOOP (6)
216 CSC_INVERSE_709_SSE2_LOOP (7)
217
218 CSC_INVERSE_709_SSE2_LOOP (8)
219 CSC_INVERSE_709_SSE2_LOOP (9)
220 CSC_INVERSE_709_SSE2_LOOP (10)
221 CSC_INVERSE_709_SSE2_LOOP (11)
222
223 CSC_INVERSE_709_SSE2_LOOP (12)
224 CSC_INVERSE_709_SSE2_LOOP (13)
225 CSC_INVERSE_709_SSE2_LOOP (14)
226 CSC_INVERSE_709_SSE2_LOOP (15)
227 }
228
229 #endif /* IMF_HAVE_SSE2 */
230
231
232 //
233 // Color space conversion, Forward 709 CSC, R'G'B' -> Y'CbCr
234 //
235 // Simple FPU color space conversion. Based on the 709
// primary chromaticities, with no scaling or offsets.
237 //
238
239 void
csc709Forward64 (float *comp0, float *comp1, float *comp2)
241 {
242 float src[3];
243
244 for (int i = 0; i<64; ++i)
245 {
246 src[0] = comp0[i];
247 src[1] = comp1[i];
248 src[2] = comp2[i];
249
250 comp0[i] = 0.2126f * src[0] + 0.7152f * src[1] + 0.0722f * src[2];
251 comp1[i] = -0.1146f * src[0] - 0.3854f * src[1] + 0.5000f * src[2];
252 comp2[i] = 0.5000f * src[0] - 0.4542f * src[1] - 0.0458f * src[2];
253 }
254 }
255
256
257 //
258 // Byte interleaving of 2 byte arrays:
259 // src0 = AAAA
260 // src1 = BBBB
261 // dst = ABABABAB
262 //
263 // numBytes is the size of each of the source buffers
264 //
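//
// For example, with numBytes = 4:
//
//    src0 = [a0 a1 a2 a3]
//    src1 = [b0 b1 b2 b3]
//    dst  = [a0 b0 a1 b1 a2 b2 a3 b3]
//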
265
266 #ifndef IMF_HAVE_SSE2
267
268 //
269 // Scalar default implementation
270 //
271
272 void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
274 {
275 for (int x = 0; x < numBytes; ++x)
276 {
277 dst[2 * x] = src0[x];
278 dst[2 * x + 1] = src1[x];
279 }
280 }
281
282 #else /* IMF_HAVE_SSE2 */
283
284 //
285 // SSE2 byte interleaving
286 //
287
288 void
interleaveByte2 (char *dst, char *src0, char *src1, int numBytes)
290 {
291 int dstAlignment = (size_t)dst % 16;
292 int src0Alignment = (size_t)src0 % 16;
293 int src1Alignment = (size_t)src1 % 16;
294
295 __m128i *dst_epi8 = (__m128i*)dst;
296 __m128i *src0_epi8 = (__m128i*)src0;
297 __m128i *src1_epi8 = (__m128i*)src1;
298 int sseWidth = numBytes / 16;
299
300 if ((!dstAlignment) && (!src0Alignment) && (!src1Alignment))
301 {
302 __m128i tmp0, tmp1;
303
304 //
305 // Aligned loads and stores
306 //
307
308 for (int x = 0; x < sseWidth; ++x)
309 {
310 tmp0 = src0_epi8[x];
311 tmp1 = src1_epi8[x];
312
313 _mm_stream_si128 (&dst_epi8[2 * x],
314 _mm_unpacklo_epi8 (tmp0, tmp1));
315
316 _mm_stream_si128 (&dst_epi8[2 * x + 1],
317 _mm_unpackhi_epi8 (tmp0, tmp1));
318 }
319
320 //
        // Then run the leftovers one at a time
322 //
323
324 for (int x = 16 * sseWidth; x < numBytes; ++x)
325 {
326 dst[2 * x] = src0[x];
327 dst[2 * x + 1] = src1[x];
328 }
329 }
330 else if ((!dstAlignment) && (src0Alignment == 8) && (src1Alignment == 8))
331 {
332 //
333 // Aligned stores, but catch up a few values so we can
334 // use aligned loads
335 //
336
337 for (int x = 0; x < 8; ++x)
338 {
339 dst[2 * x] = src0[x];
340 dst[2 * x + 1] = src1[x];
341 }
342
343 dst_epi8 = (__m128i*)&dst[16];
344 src0_epi8 = (__m128i*)&src0[8];
345 src1_epi8 = (__m128i*)&src1[8];
346 sseWidth = (numBytes - 8) / 16;
347
348 for (int x=0; x<sseWidth; ++x)
349 {
350 _mm_stream_si128 (&dst_epi8[2 * x],
351 _mm_unpacklo_epi8 (src0_epi8[x], src1_epi8[x]));
352
353 _mm_stream_si128 (&dst_epi8[2 * x + 1],
354 _mm_unpackhi_epi8 (src0_epi8[x], src1_epi8[x]));
355 }
356
357 //
        // Then run the leftovers one at a time
359 //
360
361 for (int x = 16 * sseWidth + 8; x < numBytes; ++x)
362 {
363 dst[2 * x] = src0[x];
364 dst[2 * x + 1] = src1[x];
365 }
366 }
367 else
368 {
369 //
370 // Unaligned everything
371 //
372
373 for (int x = 0; x < sseWidth; ++x)
374 {
375 __m128i tmpSrc0_epi8 = _mm_loadu_si128 (&src0_epi8[x]);
376 __m128i tmpSrc1_epi8 = _mm_loadu_si128 (&src1_epi8[x]);
377
378 _mm_storeu_si128 (&dst_epi8[2 * x],
379 _mm_unpacklo_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
380
381 _mm_storeu_si128 (&dst_epi8[2 * x + 1],
382 _mm_unpackhi_epi8 (tmpSrc0_epi8, tmpSrc1_epi8));
383 }
384
385 //
        // Then run the leftovers one at a time
387 //
388
389 for (int x = 16 * sseWidth; x < numBytes; ++x)
390 {
391 dst[2 * x] = src0[x];
392 dst[2 * x + 1] = src1[x];
393 }
394 }
395 }
396
397 #endif /* IMF_HAVE_SSE2 */
398
399
400 //
401 // Float -> half float conversion
402 //
// To enable F16C based conversion, we can't rely on compile-time
// detection, hence the multiple versions defined here. Pick one at
// runtime, based on cpuid detection.
406 //
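
//
// A sketch of the intended dispatch (the cpuid helper named here is
// illustrative, not something defined in this file):
//
//     void (*floatToHalf) (unsigned short *, float *) =
//         cpuHasF16C () ? convertFloatToHalf64_f16c
//                       : convertFloatToHalf64_scalar;
//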
407
408 //
409 // Default boring conversion
410 //
411
412 void
convertFloatToHalf64_scalar (unsigned short *dst, float *src)
414 {
415 for (int i=0; i<64; ++i)
416 dst[i] = ((half)src[i]).bits();
417 }
418
419
420 //
421 // F16C conversion - Assumes aligned src and dst
422 //
423
424 void
convertFloatToHalf64_f16c (unsigned short *dst, float *src)
426 {
427 //
    // Ordinarily, I'd avoid using inline asm and prefer intrinsics.
    // However, in order to get the intrinsics, we need to tell
    // the compiler to generate VEX instructions.
    //
    // (On the GCC side, -mf16c goes ahead and activates -mavx,
    // resulting in VEX code. Without -mf16c, no intrinsics.)
    //
    // Now, it's quite likely that we'll find ourselves in situations
    // where we want to build *without* VEX, in order to maintain
    // maximum compatibility. But to get there with intrinsics,
    // we'd need to break the code out into a separate file. Bleh.
    // I'll take the asm.
440 //
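    //
    // For reference only (not compiled here), a rough intrinsics
    // equivalent - which would require building this translation unit
    // with -mf16c - looks like:
    //
    //     for (int i = 0; i < 8; ++i)
    //     {
    //         __m256 s = _mm256_load_ps (src + 8 * i);
    //         _mm_store_si128 ((__m128i *)(dst + 8 * i),
    //                          _mm256_cvtps_ph (s, 0));
    //     }
    //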
441
442 #if defined IMF_HAVE_GCC_INLINEASM
443 __asm__
444 ("vmovaps (%0), %%ymm0 \n"
445 "vmovaps 0x20(%0), %%ymm1 \n"
446 "vmovaps 0x40(%0), %%ymm2 \n"
447 "vmovaps 0x60(%0), %%ymm3 \n"
448 "vcvtps2ph $0, %%ymm0, %%xmm0 \n"
449 "vcvtps2ph $0, %%ymm1, %%xmm1 \n"
450 "vcvtps2ph $0, %%ymm2, %%xmm2 \n"
451 "vcvtps2ph $0, %%ymm3, %%xmm3 \n"
452 "vmovdqa %%xmm0, 0x00(%1) \n"
453 "vmovdqa %%xmm1, 0x10(%1) \n"
454 "vmovdqa %%xmm2, 0x20(%1) \n"
455 "vmovdqa %%xmm3, 0x30(%1) \n"
456 "vmovaps 0x80(%0), %%ymm0 \n"
457 "vmovaps 0xa0(%0), %%ymm1 \n"
458 "vmovaps 0xc0(%0), %%ymm2 \n"
459 "vmovaps 0xe0(%0), %%ymm3 \n"
460 "vcvtps2ph $0, %%ymm0, %%xmm0 \n"
461 "vcvtps2ph $0, %%ymm1, %%xmm1 \n"
462 "vcvtps2ph $0, %%ymm2, %%xmm2 \n"
463 "vcvtps2ph $0, %%ymm3, %%xmm3 \n"
464 "vmovdqa %%xmm0, 0x40(%1) \n"
465 "vmovdqa %%xmm1, 0x50(%1) \n"
466 "vmovdqa %%xmm2, 0x60(%1) \n"
467 "vmovdqa %%xmm3, 0x70(%1) \n"
468 #ifndef __AVX__
469 "vzeroupper \n"
470 #endif /* __AVX__ */
471 : /* Output */
472 : /* Input */ "r"(src), "r"(dst)
473 #ifndef __AVX__
474 : /* Clobber */ "%xmm0", "%xmm1", "%xmm2", "%xmm3", "memory"
475 #else
476 : /* Clobber */ "%ymm0", "%ymm1", "%ymm2", "%ymm3", "memory"
477 #endif /* __AVX__ */
478 );
479 #else
480 convertFloatToHalf64_scalar (dst, src);
481 #endif /* IMF_HAVE_GCC_INLINEASM */
482 }
483
484
485 //
486 // Convert an 8x8 block of HALF from zig-zag order to
487 // FLOAT in normal order. The order we want is:
488 //
489 // src dst
490 // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28
491 // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42
492 // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43
493 // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53
494 // 32 33 34 35 36 37 38 39 10 19 23 32 39 45 52 54
495 // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60
496 // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61
497 // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63
498 //
499
500 void
fromHalfZigZag_scalar (unsigned short *src, float *dst)
502 {
503 half *srcHalf = (half *)src;
504
505 dst[0] = (float)srcHalf[0];
506 dst[1] = (float)srcHalf[1];
507 dst[2] = (float)srcHalf[5];
508 dst[3] = (float)srcHalf[6];
509 dst[4] = (float)srcHalf[14];
510 dst[5] = (float)srcHalf[15];
511 dst[6] = (float)srcHalf[27];
512 dst[7] = (float)srcHalf[28];
513 dst[8] = (float)srcHalf[2];
514 dst[9] = (float)srcHalf[4];
515
516 dst[10] = (float)srcHalf[7];
517 dst[11] = (float)srcHalf[13];
518 dst[12] = (float)srcHalf[16];
519 dst[13] = (float)srcHalf[26];
520 dst[14] = (float)srcHalf[29];
521 dst[15] = (float)srcHalf[42];
522 dst[16] = (float)srcHalf[3];
523 dst[17] = (float)srcHalf[8];
524 dst[18] = (float)srcHalf[12];
525 dst[19] = (float)srcHalf[17];
526
527 dst[20] = (float)srcHalf[25];
528 dst[21] = (float)srcHalf[30];
529 dst[22] = (float)srcHalf[41];
530 dst[23] = (float)srcHalf[43];
531 dst[24] = (float)srcHalf[9];
532 dst[25] = (float)srcHalf[11];
533 dst[26] = (float)srcHalf[18];
534 dst[27] = (float)srcHalf[24];
535 dst[28] = (float)srcHalf[31];
536 dst[29] = (float)srcHalf[40];
537
538 dst[30] = (float)srcHalf[44];
539 dst[31] = (float)srcHalf[53];
540 dst[32] = (float)srcHalf[10];
541 dst[33] = (float)srcHalf[19];
542 dst[34] = (float)srcHalf[23];
543 dst[35] = (float)srcHalf[32];
544 dst[36] = (float)srcHalf[39];
545 dst[37] = (float)srcHalf[45];
546 dst[38] = (float)srcHalf[52];
547 dst[39] = (float)srcHalf[54];
548
549 dst[40] = (float)srcHalf[20];
550 dst[41] = (float)srcHalf[22];
551 dst[42] = (float)srcHalf[33];
552 dst[43] = (float)srcHalf[38];
553 dst[44] = (float)srcHalf[46];
554 dst[45] = (float)srcHalf[51];
555 dst[46] = (float)srcHalf[55];
556 dst[47] = (float)srcHalf[60];
557 dst[48] = (float)srcHalf[21];
558 dst[49] = (float)srcHalf[34];
559
560 dst[50] = (float)srcHalf[37];
561 dst[51] = (float)srcHalf[47];
562 dst[52] = (float)srcHalf[50];
563 dst[53] = (float)srcHalf[56];
564 dst[54] = (float)srcHalf[59];
565 dst[55] = (float)srcHalf[61];
566 dst[56] = (float)srcHalf[35];
567 dst[57] = (float)srcHalf[36];
568 dst[58] = (float)srcHalf[48];
569 dst[59] = (float)srcHalf[49];
570
571 dst[60] = (float)srcHalf[57];
572 dst[61] = (float)srcHalf[58];
573 dst[62] = (float)srcHalf[62];
574 dst[63] = (float)srcHalf[63];
575 }
576
577
578 //
579 // If we can form the correct ordering in xmm registers,
580 // we can use F16C to convert from HALF -> FLOAT. However,
581 // making the correct order isn't trivial.
582 //
583 // We want to re-order a source 8x8 matrix from:
584 //
585 // 0 1 2 3 4 5 6 7 0 1 5 6 14 15 27 28
586 // 8 9 10 11 12 13 14 15 2 4 7 13 16 26 29 42
587 // 16 17 18 19 20 21 22 23 3 8 12 17 25 30 41 43
588 // 24 25 26 27 28 29 30 31 9 11 18 24 31 40 44 53 (A)
589 // 32 33 34 35 36 37 38 39 --> 10 19 23 32 39 45 52 54
590 // 40 41 42 43 44 45 46 47 20 22 33 38 46 51 55 60
591 // 48 49 50 51 52 53 54 55 21 34 37 47 50 56 59 61
592 // 56 57 58 59 60 61 62 63 35 36 48 49 57 58 62 63
593 //
594 // Which looks like a mess, right?
595 //
596 // Now, check out the NE/SW diagonals of (A). Along those lines,
597 // we have runs of contiguous values! If we rewrite (A) a bit, we get:
598 //
599 // 0
600 // 1 2
601 // 5 4 3
602 // 6 7 8 9
603 // 14 13 12 11 10
604 // 15 16 17 18 19 20
605 // 27 26 25 24 23 22 21 (B)
606 // 28 29 30 31 32 33 34 35
607 // 42 41 40 39 38 37 36
608 // 43 44 45 46 47 48
609 // 53 52 51 50 49
610 // 54 55 56 57
611 // 60 59 58
612 // 61 62
613 // 63
614 //
// In this ordering, the columns are the rows of (A). If we can 'transpose'
616 // (B), we'll achieve our goal. But we want this to fit nicely into
617 // xmm registers and still be able to load large runs efficiently.
618 // Also, notice that the odd rows are in ascending order, while
619 // the even rows are in descending order.
620 //
621 // If we 'fold' the bottom half up into the top, we can preserve ordered
// runs across rows, and still keep all the correct values in columns.
623 // After transposing, we'll need to rotate things back into place.
624 // This gives us:
625 //
626 // 0 | 42 41 40 39 38 37 36
627 // 1 2 | 43 44 45 46 47 48
628 // 5 4 3 | 53 52 51 50 49
629 // 6 7 8 9 | 54 55 56 57 (C)
630 // 14 13 12 11 10 | 60 59 58
631 // 15 16 17 18 19 20 | 61 62
// 27 26 25 24 23 22 21 | 63
633 // 28 29 30 31 32 33 34 35
634 //
635 // But hang on. We still have the backwards descending rows to deal with.
// Let's reverse the even rows so that all values are in ascending order
637 //
638 // 36 37 38 39 40 41 42 | 0
639 // 1 2 | 43 44 45 46 47 48
640 // 49 50 51 52 53 | 3 4 5
641 // 6 7 8 9 | 54 55 56 57 (D)
642 // 58 59 60 | 10 11 12 13 14
643 // 15 16 17 18 19 20 | 61 62
// 63 | 21 22 23 24 25 26 27
645 // 28 29 30 31 32 33 34 35
646 //
647 // If we can form (D), we will then:
648 // 1) Reverse the even rows
649 // 2) Transpose
650 // 3) Rotate the rows
651 //
652 // and we'll have (A).
653 //
654
655 void
fromHalfZigZag_f16c (unsigned short *src, float *dst)
657 {
658 #if defined IMF_HAVE_GCC_INLINEASM_64
659 __asm__
660
661 /* x3 <- 0
662 * x8 <- [ 0- 7]
663 * x6 <- [56-63]
664 * x9 <- [21-28]
665 * x7 <- [28-35]
666 * x3 <- [ 6- 9] (lower half) */
667
668 ("vpxor %%xmm3, %%xmm3, %%xmm3 \n"
669 "vmovdqa (%0), %%xmm8 \n"
670 "vmovdqa 112(%0), %%xmm6 \n"
671 "vmovdqu 42(%0), %%xmm9 \n"
672 "vmovdqu 56(%0), %%xmm7 \n"
673 "vmovq 12(%0), %%xmm3 \n"
674
675 /* Setup rows 0-2 of A in xmm0-xmm2
676 * x1 <- x8 >> 16 (1 value)
677 * x2 <- x8 << 32 (2 values)
678 * x0 <- alignr([35-42], x8, 2)
679 * x1 <- blend(x1, [41-48])
680 * x2 <- blend(x2, [49-56]) */
681
682 "vpsrldq $2, %%xmm8, %%xmm1 \n"
683 "vpslldq $4, %%xmm8, %%xmm2 \n"
684 "vpalignr $2, 70(%0), %%xmm8, %%xmm0 \n"
685 "vpblendw $0xfc, 82(%0), %%xmm1, %%xmm1 \n"
686 "vpblendw $0x1f, 98(%0), %%xmm2, %%xmm2 \n"
687
688 /* Setup rows 4-6 of A in xmm4-xmm6
689 * x4 <- x6 >> 32 (2 values)
690 * x5 <- x6 << 16 (1 value)
691 * x6 <- alignr(x6,x9,14)
692 * x4 <- blend(x4, [ 7-14])
693 * x5 <- blend(x5, [15-22]) */
694
695 "vpsrldq $4, %%xmm6, %%xmm4 \n"
696 "vpslldq $2, %%xmm6, %%xmm5 \n"
697 "vpalignr $14, %%xmm6, %%xmm9, %%xmm6 \n"
698 "vpblendw $0xf8, 14(%0), %%xmm4, %%xmm4 \n"
699 "vpblendw $0x3f, 30(%0), %%xmm5, %%xmm5 \n"
700
701 /* Load the upper half of row 3 into xmm3
702 * x3 <- [54-57] (upper half) */
703
704 "vpinsrq $1, 108(%0), %%xmm3, %%xmm3\n"
705
706 /* Reverse the even rows. We're not using PSHUFB as
707 * that requires loading an extra constant all the time,
     * and we're already pretty memory bound.
709 */
710
711 "vpshuflw $0x1b, %%xmm0, %%xmm0 \n"
712 "vpshuflw $0x1b, %%xmm2, %%xmm2 \n"
713 "vpshuflw $0x1b, %%xmm4, %%xmm4 \n"
714 "vpshuflw $0x1b, %%xmm6, %%xmm6 \n"
715
716 "vpshufhw $0x1b, %%xmm0, %%xmm0 \n"
717 "vpshufhw $0x1b, %%xmm2, %%xmm2 \n"
718 "vpshufhw $0x1b, %%xmm4, %%xmm4 \n"
719 "vpshufhw $0x1b, %%xmm6, %%xmm6 \n"
720
721 "vpshufd $0x4e, %%xmm0, %%xmm0 \n"
722 "vpshufd $0x4e, %%xmm2, %%xmm2 \n"
723 "vpshufd $0x4e, %%xmm4, %%xmm4 \n"
724 "vpshufd $0x4e, %%xmm6, %%xmm6 \n"
725
726 /* Transpose xmm0-xmm7 into xmm8-xmm15 */
727
728 "vpunpcklwd %%xmm1, %%xmm0, %%xmm8 \n"
729 "vpunpcklwd %%xmm3, %%xmm2, %%xmm9 \n"
730 "vpunpcklwd %%xmm5, %%xmm4, %%xmm10 \n"
731 "vpunpcklwd %%xmm7, %%xmm6, %%xmm11 \n"
732 "vpunpckhwd %%xmm1, %%xmm0, %%xmm12 \n"
733 "vpunpckhwd %%xmm3, %%xmm2, %%xmm13 \n"
734 "vpunpckhwd %%xmm5, %%xmm4, %%xmm14 \n"
735 "vpunpckhwd %%xmm7, %%xmm6, %%xmm15 \n"
736
737 "vpunpckldq %%xmm9, %%xmm8, %%xmm0 \n"
738 "vpunpckldq %%xmm11, %%xmm10, %%xmm1 \n"
739 "vpunpckhdq %%xmm9, %%xmm8, %%xmm2 \n"
740 "vpunpckhdq %%xmm11, %%xmm10, %%xmm3 \n"
741 "vpunpckldq %%xmm13, %%xmm12, %%xmm4 \n"
742 "vpunpckldq %%xmm15, %%xmm14, %%xmm5 \n"
743 "vpunpckhdq %%xmm13, %%xmm12, %%xmm6 \n"
744 "vpunpckhdq %%xmm15, %%xmm14, %%xmm7 \n"
745
746 "vpunpcklqdq %%xmm1, %%xmm0, %%xmm8 \n"
747 "vpunpckhqdq %%xmm1, %%xmm0, %%xmm9 \n"
748 "vpunpcklqdq %%xmm3, %%xmm2, %%xmm10 \n"
749 "vpunpckhqdq %%xmm3, %%xmm2, %%xmm11 \n"
750 "vpunpcklqdq %%xmm4, %%xmm5, %%xmm12 \n"
751 "vpunpckhqdq %%xmm5, %%xmm4, %%xmm13 \n"
752 "vpunpcklqdq %%xmm7, %%xmm6, %%xmm14 \n"
753 "vpunpckhqdq %%xmm7, %%xmm6, %%xmm15 \n"
754
755 /* Rotate the rows to get the correct final order.
756 * Rotating xmm12 isn't needed, as we can handle
757 * the rotation in the PUNPCKLQDQ above. Rotating
758 * xmm8 isn't needed as it's already in the right order
759 */
760
761 "vpalignr $2, %%xmm9, %%xmm9, %%xmm9 \n"
762 "vpalignr $4, %%xmm10, %%xmm10, %%xmm10 \n"
763 "vpalignr $6, %%xmm11, %%xmm11, %%xmm11 \n"
764 "vpalignr $10, %%xmm13, %%xmm13, %%xmm13 \n"
765 "vpalignr $12, %%xmm14, %%xmm14, %%xmm14 \n"
766 "vpalignr $14, %%xmm15, %%xmm15, %%xmm15 \n"
767
768 /* Convert from half -> float */
769
770 "vcvtph2ps %%xmm8, %%ymm8 \n"
771 "vcvtph2ps %%xmm9, %%ymm9 \n"
772 "vcvtph2ps %%xmm10, %%ymm10 \n"
773 "vcvtph2ps %%xmm11, %%ymm11 \n"
774 "vcvtph2ps %%xmm12, %%ymm12 \n"
775 "vcvtph2ps %%xmm13, %%ymm13 \n"
776 "vcvtph2ps %%xmm14, %%ymm14 \n"
777 "vcvtph2ps %%xmm15, %%ymm15 \n"
778
779 /* Move float values to dst */
780
781 "vmovaps %%ymm8, (%1) \n"
782 "vmovaps %%ymm9, 32(%1) \n"
783 "vmovaps %%ymm10, 64(%1) \n"
784 "vmovaps %%ymm11, 96(%1) \n"
785 "vmovaps %%ymm12, 128(%1) \n"
786 "vmovaps %%ymm13, 160(%1) \n"
787 "vmovaps %%ymm14, 192(%1) \n"
788 "vmovaps %%ymm15, 224(%1) \n"
789 #ifndef __AVX__
790 "vzeroupper \n"
791 #endif /* __AVX__ */
792 : /* Output */
793 : /* Input */ "r"(src), "r"(dst)
794 : /* Clobber */ "memory",
795 #ifndef __AVX__
796 "%xmm0", "%xmm1", "%xmm2", "%xmm3",
797 "%xmm4", "%xmm5", "%xmm6", "%xmm7",
798 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
799 "%xmm12", "%xmm13", "%xmm14", "%xmm15"
800 #else
801 "%ymm0", "%ymm1", "%ymm2", "%ymm3",
802 "%ymm4", "%ymm5", "%ymm6", "%ymm7",
803 "%ymm8", "%ymm9", "%ymm10", "%ymm11",
804 "%ymm12", "%ymm13", "%ymm14", "%ymm15"
805 #endif /* __AVX__ */
806 );
807
808 #else
809 fromHalfZigZag_scalar(src, dst);
810 #endif /* defined IMF_HAVE_GCC_INLINEASM_64 */
811 }
812
813
814 //
815 // Inverse 8x8 DCT, only inverting the DC. This assumes that
816 // all AC frequencies are 0.
817 //
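//
// The constant 3.535536e-01 is 0.5 * cos(pi/4); it is applied once in the
// row pass and once in the column pass, hence the squared factor.
//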
818
819 #ifndef IMF_HAVE_SSE2
820
821 void
dctInverse8x8DcOnly (float *data)
823 {
824 float val = data[0] * 3.535536e-01f * 3.535536e-01f;
825
826 for (int i = 0; i < 64; ++i)
827 data[i] = val;
828 }
829
830 #else /* IMF_HAVE_SSE2 */
831
832 void
dctInverse8x8DcOnly (float *data)
834 {
835 __m128 src = _mm_set1_ps (data[0] * 3.535536e-01f * 3.535536e-01f);
836 __m128 *dst = (__m128 *)data;
837
838 for (int i = 0; i < 16; ++i)
839 dst[i] = src;
840 }
841
842 #endif /* IMF_HAVE_SSE2 */
843
844
845 //
846 // Full 8x8 Inverse DCT:
847 //
848 // Simple inverse DCT on an 8x8 block, with scalar ops only.
849 // Operates on data in-place.
850 //
// This is based on the iDCT formulation (y = frequency domain,
852 // x = spatial domain)
853 //
854 // [x0] [ ][y0] [ ][y1]
855 // [x1] = [ M1 ][y2] + [ M2 ][y3]
856 // [x2] [ ][y4] [ ][y5]
857 // [x3] [ ][y6] [ ][y7]
858 //
859 // [x7] [ ][y0] [ ][y1]
860 // [x6] = [ M1 ][y2] - [ M2 ][y3]
861 // [x5] [ ][y4] [ ][y5]
862 // [x4] [ ][y6] [ ][y7]
863 //
864 // where M1: M2:
865 //
866 // [a c a f] [b d e g]
867 // [a f -a -c] [d -g -b -e]
868 // [a -f -a c] [e -b g d]
869 // [a -c a -f] [g -e d -b]
870 //
871 // and the constants are as defined below..
872 //
873 // If you know how many of the lower rows are zero, that can
874 // be passed in to help speed things up. If you don't know,
875 // just set zeroedRows=0.
876 //
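//
// For example, a caller that knows the bottom three rows of the block are
// zero would use the <3> instantiation (illustrative call, 'block' being a
// pointer to 64 floats):
//
//     dctInverse8x8_scalar<3> (block);
//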
877
878 //
879 // Default implementation
880 //
881
882 template <int zeroedRows>
883 void
dctInverse8x8_scalar (float *data)
885 {
886 const float a = .5f * cosf (3.14159f / 4.0f);
887 const float b = .5f * cosf (3.14159f / 16.0f);
888 const float c = .5f * cosf (3.14159f / 8.0f);
889 const float d = .5f * cosf (3.f*3.14159f / 16.0f);
890 const float e = .5f * cosf (5.f*3.14159f / 16.0f);
891 const float f = .5f * cosf (3.f*3.14159f / 8.0f);
892 const float g = .5f * cosf (7.f*3.14159f / 16.0f);
893
894 float alpha[4], beta[4], theta[4], gamma[4];
895
896 float *rowPtr = NULL;
897
898 //
899 // First pass - row wise.
900 //
901 // This looks less-compact than the description above in
902 // an attempt to fold together common sub-expressions.
903 //
904
905 for (int row = 0; row < 8 - zeroedRows; ++row)
906 {
907 rowPtr = data + row * 8;
908
909 alpha[0] = c * rowPtr[2];
910 alpha[1] = f * rowPtr[2];
911 alpha[2] = c * rowPtr[6];
912 alpha[3] = f * rowPtr[6];
913
914 beta[0] = b * rowPtr[1] + d * rowPtr[3] + e * rowPtr[5] + g * rowPtr[7];
915 beta[1] = d * rowPtr[1] - g * rowPtr[3] - b * rowPtr[5] - e * rowPtr[7];
916 beta[2] = e * rowPtr[1] - b * rowPtr[3] + g * rowPtr[5] + d * rowPtr[7];
917 beta[3] = g * rowPtr[1] - e * rowPtr[3] + d * rowPtr[5] - b * rowPtr[7];
918
919 theta[0] = a * (rowPtr[0] + rowPtr[4]);
920 theta[3] = a * (rowPtr[0] - rowPtr[4]);
921
922 theta[1] = alpha[0] + alpha[3];
923 theta[2] = alpha[1] - alpha[2];
924
925
926 gamma[0] = theta[0] + theta[1];
927 gamma[1] = theta[3] + theta[2];
928 gamma[2] = theta[3] - theta[2];
929 gamma[3] = theta[0] - theta[1];
930
931
932 rowPtr[0] = gamma[0] + beta[0];
933 rowPtr[1] = gamma[1] + beta[1];
934 rowPtr[2] = gamma[2] + beta[2];
935 rowPtr[3] = gamma[3] + beta[3];
936
937 rowPtr[4] = gamma[3] - beta[3];
938 rowPtr[5] = gamma[2] - beta[2];
939 rowPtr[6] = gamma[1] - beta[1];
940 rowPtr[7] = gamma[0] - beta[0];
941 }
942
943 //
944 // Second pass - column wise.
945 //
946
947 for (int column = 0; column < 8; ++column)
948 {
949 alpha[0] = c * data[16+column];
950 alpha[1] = f * data[16+column];
951 alpha[2] = c * data[48+column];
952 alpha[3] = f * data[48+column];
953
954 beta[0] = b * data[8+column] + d * data[24+column] +
955 e * data[40+column] + g * data[56+column];
956
957 beta[1] = d * data[8+column] - g * data[24+column] -
958 b * data[40+column] - e * data[56+column];
959
960 beta[2] = e * data[8+column] - b * data[24+column] +
961 g * data[40+column] + d * data[56+column];
962
963 beta[3] = g * data[8+column] - e * data[24+column] +
964 d * data[40+column] - b * data[56+column];
965
966 theta[0] = a * (data[column] + data[32+column]);
967 theta[3] = a * (data[column] - data[32+column]);
968
969 theta[1] = alpha[0] + alpha[3];
970 theta[2] = alpha[1] - alpha[2];
971
972 gamma[0] = theta[0] + theta[1];
973 gamma[1] = theta[3] + theta[2];
974 gamma[2] = theta[3] - theta[2];
975 gamma[3] = theta[0] - theta[1];
976
977 data[ column] = gamma[0] + beta[0];
978 data[ 8 + column] = gamma[1] + beta[1];
979 data[16 + column] = gamma[2] + beta[2];
980 data[24 + column] = gamma[3] + beta[3];
981
982 data[32 + column] = gamma[3] - beta[3];
983 data[40 + column] = gamma[2] - beta[2];
984 data[48 + column] = gamma[1] - beta[1];
985 data[56 + column] = gamma[0] - beta[0];
986 }
987 }
988
989
990 //
991 // SSE2 Implementation
992 //
993
994 template <int zeroedRows>
995 void
dctInverse8x8_sse2 (float *data)
997 {
998 #ifdef IMF_HAVE_SSE2
999 __m128 a = {3.535536e-01f,3.535536e-01f,3.535536e-01f,3.535536e-01f};
1000 __m128 b = {4.903927e-01f,4.903927e-01f,4.903927e-01f,4.903927e-01f};
1001 __m128 c = {4.619398e-01f,4.619398e-01f,4.619398e-01f,4.619398e-01f};
1002 __m128 d = {4.157349e-01f,4.157349e-01f,4.157349e-01f,4.157349e-01f};
1003 __m128 e = {2.777855e-01f,2.777855e-01f,2.777855e-01f,2.777855e-01f};
1004 __m128 f = {1.913422e-01f,1.913422e-01f,1.913422e-01f,1.913422e-01f};
1005 __m128 g = {9.754573e-02f,9.754573e-02f,9.754573e-02f,9.754573e-02f};
1006
1007 __m128 c0 = {3.535536e-01f, 3.535536e-01f, 3.535536e-01f, 3.535536e-01f};
1008 __m128 c1 = {4.619398e-01f, 1.913422e-01f,-1.913422e-01f,-4.619398e-01f};
1009 __m128 c2 = {3.535536e-01f,-3.535536e-01f,-3.535536e-01f, 3.535536e-01f};
1010 __m128 c3 = {1.913422e-01f,-4.619398e-01f, 4.619398e-01f,-1.913422e-01f};
1011
1012 __m128 c4 = {4.903927e-01f, 4.157349e-01f, 2.777855e-01f, 9.754573e-02f};
1013 __m128 c5 = {4.157349e-01f,-9.754573e-02f,-4.903927e-01f,-2.777855e-01f};
1014 __m128 c6 = {2.777855e-01f,-4.903927e-01f, 9.754573e-02f, 4.157349e-01f};
1015 __m128 c7 = {9.754573e-02f,-2.777855e-01f, 4.157349e-01f,-4.903927e-01f};
1016
1017 __m128 *srcVec = (__m128 *)data;
1018 __m128 x[8], evenSum, oddSum;
1019 __m128 in[8], alpha[4], beta[4], theta[4], gamma[4];
1020
1021 //
1022 // Rows -
1023 //
1024 // Treat this just like matrix-vector multiplication. The
1025 // trick is to note that:
1026 //
1027 // [M00 M01 M02 M03][v0] [(v0 M00) + (v1 M01) + (v2 M02) + (v3 M03)]
1028 // [M10 M11 M12 M13][v1] = [(v0 M10) + (v1 M11) + (v2 M12) + (v3 M13)]
1029 // [M20 M21 M22 M23][v2] [(v0 M20) + (v1 M21) + (v2 M22) + (v3 M23)]
1030 // [M30 M31 M32 M33][v3] [(v0 M30) + (v1 M31) + (v2 M32) + (v3 M33)]
1031 //
1032 // Then, we can fill a register with v_i and multiply by the i-th column
1033 // of M, accumulating across all i-s.
1034 //
    // The kids refer to populating a register with a single value as
    // "broadcasting", and it can be done with a shuffle instruction. It
1037 // seems to be the slowest part of the whole ordeal.
1038 //
1039 // Our matrix columns are stored above in c0-c7. c0-3 make up M1, and
1040 // c4-7 are from M2.
1041 //
1042
1043 #define DCT_INVERSE_8x8_SS2_ROW_LOOP(i) \
1044 /* \
1045 * Broadcast the components of the row \
1046 */ \
1047 \
1048 x[0] = _mm_shuffle_ps (srcVec[2 * i], \
1049 srcVec[2 * i], \
1050 _MM_SHUFFLE (0, 0, 0, 0)); \
1051 \
1052 x[1] = _mm_shuffle_ps (srcVec[2 * i], \
1053 srcVec[2 * i], \
1054 _MM_SHUFFLE (1, 1, 1, 1)); \
1055 \
1056 x[2] = _mm_shuffle_ps (srcVec[2 * i], \
1057 srcVec[2 * i], \
1058 _MM_SHUFFLE (2, 2, 2, 2)); \
1059 \
1060 x[3] = _mm_shuffle_ps (srcVec[2 * i], \
1061 srcVec[2 * i], \
1062 _MM_SHUFFLE (3, 3, 3, 3)); \
1063 \
1064 x[4] = _mm_shuffle_ps (srcVec[2 * i + 1], \
1065 srcVec[2 * i + 1], \
1066 _MM_SHUFFLE (0, 0, 0, 0)); \
1067 \
1068 x[5] = _mm_shuffle_ps (srcVec[2 * i + 1], \
1069 srcVec[2 * i + 1], \
1070 _MM_SHUFFLE (1, 1, 1, 1)); \
1071 \
1072 x[6] = _mm_shuffle_ps (srcVec[2 * i + 1], \
1073 srcVec[2 * i + 1], \
1074 _MM_SHUFFLE (2, 2, 2, 2)); \
1075 \
1076 x[7] = _mm_shuffle_ps (srcVec[2 * i + 1], \
1077 srcVec[2 * i + 1], \
1078 _MM_SHUFFLE (3, 3, 3, 3)); \
1079 /* \
1080 * Multiply the components by each column of the matrix \
1081 */ \
1082 \
1083 x[0] = _mm_mul_ps (x[0], c0); \
1084 x[2] = _mm_mul_ps (x[2], c1); \
1085 x[4] = _mm_mul_ps (x[4], c2); \
1086 x[6] = _mm_mul_ps (x[6], c3); \
1087 \
1088 x[1] = _mm_mul_ps (x[1], c4); \
1089 x[3] = _mm_mul_ps (x[3], c5); \
1090 x[5] = _mm_mul_ps (x[5], c6); \
1091 x[7] = _mm_mul_ps (x[7], c7); \
1092 \
1093 /* \
1094 * Add across \
1095 */ \
1096 \
1097 evenSum = _mm_setzero_ps(); \
1098 evenSum = _mm_add_ps (evenSum, x[0]); \
1099 evenSum = _mm_add_ps (evenSum, x[2]); \
1100 evenSum = _mm_add_ps (evenSum, x[4]); \
1101 evenSum = _mm_add_ps (evenSum, x[6]); \
1102 \
1103 oddSum = _mm_setzero_ps(); \
1104 oddSum = _mm_add_ps (oddSum, x[1]); \
1105 oddSum = _mm_add_ps (oddSum, x[3]); \
1106 oddSum = _mm_add_ps (oddSum, x[5]); \
1107 oddSum = _mm_add_ps (oddSum, x[7]); \
1108 \
1109 /* \
1110 * Final Sum: \
1111 * out [0, 1, 2, 3] = evenSum + oddSum \
1112 * out [7, 6, 5, 4] = evenSum - oddSum \
1113 */ \
1114 \
1115 srcVec[2 * i] = _mm_add_ps (evenSum, oddSum); \
1116 srcVec[2 * i + 1] = _mm_sub_ps (evenSum, oddSum); \
1117 srcVec[2 * i + 1] = _mm_shuffle_ps (srcVec[2 * i + 1], \
1118 srcVec[2 * i + 1], \
1119 _MM_SHUFFLE (0, 1, 2, 3));
1120
1121 switch (zeroedRows)
1122 {
1123 case 0:
1124 default:
1125 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1126 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1127 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1128 DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1129 DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1130 DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1131 DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1132 DCT_INVERSE_8x8_SS2_ROW_LOOP (7)
1133 break;
1134
1135 case 1:
1136 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1137 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1138 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1139 DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1140 DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1141 DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1142 DCT_INVERSE_8x8_SS2_ROW_LOOP (6)
1143 break;
1144
1145 case 2:
1146 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1147 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1148 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1149 DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1150 DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1151 DCT_INVERSE_8x8_SS2_ROW_LOOP (5)
1152 break;
1153
1154 case 3:
1155 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1156 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1157 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1158 DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1159 DCT_INVERSE_8x8_SS2_ROW_LOOP (4)
1160 break;
1161
1162 case 4:
1163 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1164 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1165 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1166 DCT_INVERSE_8x8_SS2_ROW_LOOP (3)
1167 break;
1168
1169 case 5:
1170 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1171 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1172 DCT_INVERSE_8x8_SS2_ROW_LOOP (2)
1173 break;
1174
1175 case 6:
1176 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1177 DCT_INVERSE_8x8_SS2_ROW_LOOP (1)
1178 break;
1179
1180 case 7:
1181 DCT_INVERSE_8x8_SS2_ROW_LOOP (0)
1182 break;
1183 }
1184
1185 //
1186 // Columns -
1187 //
1188 // This is slightly more straightforward, if less readable. Here
1189 // we just operate on 4 columns at a time, in two batches.
1190 //
1191 // The slight mess is to try and cache sub-expressions, which
1192 // we ignore in the row-wise pass.
1193 //
1194
1195 for (int col = 0; col < 2; ++col)
1196 {
1197
1198 for (int i = 0; i < 8; ++i)
1199 in[i] = srcVec[2 * i + col];
1200
1201 alpha[0] = _mm_mul_ps (c, in[2]);
1202 alpha[1] = _mm_mul_ps (f, in[2]);
1203 alpha[2] = _mm_mul_ps (c, in[6]);
1204 alpha[3] = _mm_mul_ps (f, in[6]);
1205
1206 beta[0] = _mm_add_ps (_mm_add_ps (_mm_mul_ps (in[1], b),
1207 _mm_mul_ps (in[3], d)),
1208 _mm_add_ps (_mm_mul_ps (in[5], e),
1209 _mm_mul_ps (in[7], g)));
1210
1211 beta[1] = _mm_sub_ps (_mm_sub_ps (_mm_mul_ps (in[1], d),
1212 _mm_mul_ps (in[3], g)),
1213 _mm_add_ps (_mm_mul_ps (in[5], b),
1214 _mm_mul_ps (in[7], e)));
1215
1216 beta[2] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], e),
1217 _mm_mul_ps (in[3], b)),
1218 _mm_add_ps (_mm_mul_ps (in[5], g),
1219 _mm_mul_ps (in[7], d)));
1220
1221 beta[3] = _mm_add_ps (_mm_sub_ps (_mm_mul_ps (in[1], g),
1222 _mm_mul_ps (in[3], e)),
1223 _mm_sub_ps (_mm_mul_ps (in[5], d),
1224 _mm_mul_ps (in[7], b)));
1225
1226 theta[0] = _mm_mul_ps (a, _mm_add_ps (in[0], in[4]));
1227 theta[3] = _mm_mul_ps (a, _mm_sub_ps (in[0], in[4]));
1228
1229 theta[1] = _mm_add_ps (alpha[0], alpha[3]);
1230 theta[2] = _mm_sub_ps (alpha[1], alpha[2]);
1231
1232 gamma[0] = _mm_add_ps (theta[0], theta[1]);
1233 gamma[1] = _mm_add_ps (theta[3], theta[2]);
1234 gamma[2] = _mm_sub_ps (theta[3], theta[2]);
1235 gamma[3] = _mm_sub_ps (theta[0], theta[1]);
1236
1237 srcVec[ col] = _mm_add_ps (gamma[0], beta[0]);
1238 srcVec[2+col] = _mm_add_ps (gamma[1], beta[1]);
1239 srcVec[4+col] = _mm_add_ps (gamma[2], beta[2]);
1240 srcVec[6+col] = _mm_add_ps (gamma[3], beta[3]);
1241
1242 srcVec[ 8+col] = _mm_sub_ps (gamma[3], beta[3]);
1243 srcVec[10+col] = _mm_sub_ps (gamma[2], beta[2]);
1244 srcVec[12+col] = _mm_sub_ps (gamma[1], beta[1]);
1245 srcVec[14+col] = _mm_sub_ps (gamma[0], beta[0]);
1246 }
1247
1248 #else /* IMF_HAVE_SSE2 */
1249
1250 dctInverse8x8_scalar<zeroedRows> (data);
1251
1252 #endif /* IMF_HAVE_SSE2 */
1253 }
1254
1255
1256 //
1257 // AVX Implementation
1258 //
1259
1260 #define STR(A) #A
1261
1262 #define IDCT_AVX_SETUP_2_ROWS(_DST0, _DST1, _TMP0, _TMP1, \
1263 _OFF00, _OFF01, _OFF10, _OFF11) \
1264 "vmovaps " STR(_OFF00) "(%0), %%xmm" STR(_TMP0) " \n" \
1265 "vmovaps " STR(_OFF01) "(%0), %%xmm" STR(_TMP1) " \n" \
1266 " \n" \
1267 "vinsertf128 $1, " STR(_OFF10) "(%0), %%ymm" STR(_TMP0) ", %%ymm" STR(_TMP0) " \n" \
1268 "vinsertf128 $1, " STR(_OFF11) "(%0), %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP1) " \n" \
1269 " \n" \
1270 "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
1271 "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n" \
1272 " \n" \
1273 "vunpcklps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP0) " \n" \
1274 "vunpckhps %%ymm" STR(_DST1) ", %%ymm" STR(_DST0) ", %%ymm" STR(_TMP1) " \n" \
1275 " \n" \
1276 "vunpcklpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST0) " \n" \
1277 "vunpckhpd %%ymm" STR(_TMP1) ", %%ymm" STR(_TMP0) ", %%ymm" STR(_DST1) " \n"
1278
1279 #define IDCT_AVX_MMULT_ROWS(_SRC) \
1280 /* Broadcast the source values into y12-y15 */ \
1281 "vpermilps $0x00, " STR(_SRC) ", %%ymm12 \n" \
1282 "vpermilps $0x55, " STR(_SRC) ", %%ymm13 \n" \
1283 "vpermilps $0xaa, " STR(_SRC) ", %%ymm14 \n" \
1284 "vpermilps $0xff, " STR(_SRC) ", %%ymm15 \n" \
1285 \
    /* Multiply the coefs by the broadcast values */                        \
1287 "vmulps %%ymm12, %%ymm8, %%ymm12 \n" \
1288 "vmulps %%ymm13, %%ymm9, %%ymm13 \n" \
1289 "vmulps %%ymm14, %%ymm10, %%ymm14 \n" \
1290 "vmulps %%ymm15, %%ymm11, %%ymm15 \n" \
1291 \
1292 /* Accumulate the result back into the source */ \
1293 "vaddps %%ymm13, %%ymm12, %%ymm12 \n" \
1294 "vaddps %%ymm15, %%ymm14, %%ymm14 \n" \
1295 "vaddps %%ymm14, %%ymm12, " STR(_SRC) "\n"
1296
1297 #define IDCT_AVX_EO_TO_ROW_HALVES(_EVEN, _ODD, _FRONT, _BACK) \
1298 "vsubps " STR(_ODD) "," STR(_EVEN) "," STR(_BACK) "\n" \
1299 "vaddps " STR(_ODD) "," STR(_EVEN) "," STR(_FRONT) "\n" \
1300 /* Reverse the back half */ \
1301 "vpermilps $0x1b," STR(_BACK) "," STR(_BACK) "\n"
1302
/* In order to allow fast paths when we know certain rows
 * of the 8x8 block are zero, most of the body of the DCT is
 * in the following macro. Statements are wrapped in a ROWn()
 * macro, where n is the lowest row in the 8x8 block on which
 * they depend.
 *
 * This should work for the cases where we have 2-8 full rows.
 * The 1-row case is special, and we'll handle it separately.
 */
1312 #define IDCT_AVX_BODY \
1313 /* ==============================================
1314 * Row 1D DCT
1315 * ----------------------------------------------
1316 */ \
1317 \
1318 /* Setup for the row-oriented 1D DCT. Assuming that (%0) holds
1319 * the row-major 8x8 block, load ymm0-3 with the even columns
1320 * and ymm4-7 with the odd columns. The lower half of the ymm
1321 * holds one row, while the upper half holds the next row.
1322 *
1323 * If our source is:
1324 * a0 a1 a2 a3 a4 a5 a6 a7
1325 * b0 b1 b2 b3 b4 b5 b6 b7
1326 *
1327 * We'll be forming:
1328 * a0 a2 a4 a6 b0 b2 b4 b6
1329 * a1 a3 a5 a7 b1 b3 b5 b7
1330 */ \
1331 ROW0( IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48) ) \
1332 ROW2( IDCT_AVX_SETUP_2_ROWS(1, 5, 12, 13, 64, 80, 96, 112) ) \
1333 ROW4( IDCT_AVX_SETUP_2_ROWS(2, 6, 10, 11, 128, 144, 160, 176) ) \
1334 ROW6( IDCT_AVX_SETUP_2_ROWS(3, 7, 8, 9, 192, 208, 224, 240) ) \
1335 \
    /* Multiply the even columns (ymm0-3) by the matrix M1
1337 * storing the results back in ymm0-3
1338 *
1339 * Assume that (%1) holds the matrix in column major order
1340 */ \
1341 "vbroadcastf128 (%1), %%ymm8 \n" \
1342 "vbroadcastf128 16(%1), %%ymm9 \n" \
1343 "vbroadcastf128 32(%1), %%ymm10 \n" \
1344 "vbroadcastf128 48(%1), %%ymm11 \n" \
1345 \
1346 ROW0( IDCT_AVX_MMULT_ROWS(%%ymm0) ) \
1347 ROW2( IDCT_AVX_MMULT_ROWS(%%ymm1) ) \
1348 ROW4( IDCT_AVX_MMULT_ROWS(%%ymm2) ) \
1349 ROW6( IDCT_AVX_MMULT_ROWS(%%ymm3) ) \
1350 \
1351 /* Repeat, but with the odd columns (ymm4-7) and the
1352 * matrix M2
1353 */ \
1354 "vbroadcastf128 64(%1), %%ymm8 \n" \
1355 "vbroadcastf128 80(%1), %%ymm9 \n" \
1356 "vbroadcastf128 96(%1), %%ymm10 \n" \
1357 "vbroadcastf128 112(%1), %%ymm11 \n" \
1358 \
1359 ROW0( IDCT_AVX_MMULT_ROWS(%%ymm4) ) \
1360 ROW2( IDCT_AVX_MMULT_ROWS(%%ymm5) ) \
1361 ROW4( IDCT_AVX_MMULT_ROWS(%%ymm6) ) \
1362 ROW6( IDCT_AVX_MMULT_ROWS(%%ymm7) ) \
1363 \
1364 /* Sum the M1 (ymm0-3) and M2 (ymm4-7) results to get the
1365 * front halves of the results, and difference to get the
     * back halves. The front halves end up in ymm0-3, the back
1367 * halves end up in ymm12-15.
1368 */ \
1369 ROW0( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12) ) \
1370 ROW2( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm1, %%ymm5, %%ymm1, %%ymm13) ) \
1371 ROW4( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm2, %%ymm6, %%ymm2, %%ymm14) ) \
1372 ROW6( IDCT_AVX_EO_TO_ROW_HALVES(%%ymm3, %%ymm7, %%ymm3, %%ymm15) ) \
1373 \
    /* Reassemble the row halves into ymm0-7 */                             \
1375 ROW7( "vperm2f128 $0x13, %%ymm3, %%ymm15, %%ymm7 \n" ) \
1376 ROW6( "vperm2f128 $0x02, %%ymm3, %%ymm15, %%ymm6 \n" ) \
1377 ROW5( "vperm2f128 $0x13, %%ymm2, %%ymm14, %%ymm5 \n" ) \
1378 ROW4( "vperm2f128 $0x02, %%ymm2, %%ymm14, %%ymm4 \n" ) \
1379 ROW3( "vperm2f128 $0x13, %%ymm1, %%ymm13, %%ymm3 \n" ) \
1380 ROW2( "vperm2f128 $0x02, %%ymm1, %%ymm13, %%ymm2 \n" ) \
1381 ROW1( "vperm2f128 $0x13, %%ymm0, %%ymm12, %%ymm1 \n" ) \
1382 ROW0( "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n" ) \
1383 \
1384 \
1385 /* ==============================================
1386 * Column 1D DCT
1387 * ----------------------------------------------
1388 */ \
1389 \
1390 /* Rows should be in ymm0-7, and M2 columns should still be
1391 * preserved in ymm8-11. M2 has 4 unique values (and +-
1392 * versions of each), and all (positive) values appear in
1393 * the first column (and row), which is in ymm8.
1394 *
1395 * For the column-wise DCT, we need to:
1396 * 1) Broadcast each element a row of M2 into 4 vectors
     * 2) Multiply the odd rows (ymm1,3,5,7) by the broadcasts.
1398 * 3) Accumulate into ymm12-15 for the odd outputs.
1399 *
1400 * Instead of doing 16 broadcasts for each element in M2,
1401 * do 4, filling y8-11 with:
1402 *
1403 * ymm8: [ b b b b | b b b b ]
1404 * ymm9: [ d d d d | d d d d ]
1405 * ymm10: [ e e e e | e e e e ]
1406 * ymm11: [ g g g g | g g g g ]
1407 *
1408 * And deal with the negative values by subtracting during accum.
1409 */ \
1410 "vpermilps $0xff, %%ymm8, %%ymm11 \n" \
1411 "vpermilps $0xaa, %%ymm8, %%ymm10 \n" \
1412 "vpermilps $0x55, %%ymm8, %%ymm9 \n" \
1413 "vpermilps $0x00, %%ymm8, %%ymm8 \n" \
1414 \
1415 /* This one is easy, since we have ymm12-15 open for scratch
1416 * ymm12 = b ymm1 + d ymm3 + e ymm5 + g ymm7
1417 */ \
1418 ROW1( "vmulps %%ymm1, %%ymm8, %%ymm12 \n" ) \
1419 ROW3( "vmulps %%ymm3, %%ymm9, %%ymm13 \n" ) \
1420 ROW5( "vmulps %%ymm5, %%ymm10, %%ymm14 \n" ) \
1421 ROW7( "vmulps %%ymm7, %%ymm11, %%ymm15 \n" ) \
1422 \
1423 ROW3( "vaddps %%ymm12, %%ymm13, %%ymm12 \n" ) \
1424 ROW7( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
1425 ROW5( "vaddps %%ymm12, %%ymm14, %%ymm12 \n" ) \
1426 \
    /* Trickier, since only y13-15 are open for scratch
1428 * ymm13 = d ymm1 - g ymm3 - b ymm5 - e ymm7
1429 */ \
1430 ROW1( "vmulps %%ymm1, %%ymm9, %%ymm13 \n" ) \
1431 ROW3( "vmulps %%ymm3, %%ymm11, %%ymm14 \n" ) \
1432 ROW5( "vmulps %%ymm5, %%ymm8, %%ymm15 \n" ) \
1433 \
1434 ROW5( "vaddps %%ymm14, %%ymm15, %%ymm14 \n" ) \
1435 ROW3( "vsubps %%ymm14, %%ymm13, %%ymm13 \n" ) \
1436 \
1437 ROW7( "vmulps %%ymm7, %%ymm10, %%ymm15 \n" ) \
1438 ROW7( "vsubps %%ymm15, %%ymm13, %%ymm13 \n" ) \
1439 \
    /* Trickier still, as only y14-15 are open for scratch
1441 * ymm14 = e ymm1 - b ymm3 + g ymm5 + d ymm7
1442 */ \
1443 ROW1( "vmulps %%ymm1, %%ymm10, %%ymm14 \n" ) \
1444 ROW3( "vmulps %%ymm3, %%ymm8, %%ymm15 \n" ) \
1445 \
1446 ROW3( "vsubps %%ymm15, %%ymm14, %%ymm14 \n" ) \
1447 \
1448 ROW5( "vmulps %%ymm5, %%ymm11, %%ymm15 \n" ) \
1449 ROW5( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
1450 \
1451 ROW7( "vmulps %%ymm7, %%ymm9, %%ymm15 \n" ) \
1452 ROW7( "vaddps %%ymm15, %%ymm14, %%ymm14 \n" ) \
1453 \
1454 \
1455 /* Easy, as we can blow away ymm1,3,5,7 for scratch
1456 * ymm15 = g ymm1 - e ymm3 + d ymm5 - b ymm7
1457 */ \
1458 ROW1( "vmulps %%ymm1, %%ymm11, %%ymm15 \n" ) \
1459 ROW3( "vmulps %%ymm3, %%ymm10, %%ymm3 \n" ) \
1460 ROW5( "vmulps %%ymm5, %%ymm9, %%ymm5 \n" ) \
1461 ROW7( "vmulps %%ymm7, %%ymm8, %%ymm7 \n" ) \
1462 \
1463 ROW5( "vaddps %%ymm15, %%ymm5, %%ymm15 \n" ) \
1464 ROW7( "vaddps %%ymm3, %%ymm7, %%ymm3 \n" ) \
1465 ROW3( "vsubps %%ymm3, %%ymm15, %%ymm15 \n" ) \
1466 \
1467 \
1468 /* Load coefs for M1. Because we're going to broadcast
1469 * coefs, we don't need to load the actual structure from
1470 * M1. Instead, just load enough that we can broadcast.
1471 * There are only 6 unique values in M1, but they're in +-
1472 * pairs, leaving only 3 unique coefs if we add and subtract
1473 * properly.
1474 *
1475 * Fill ymm1 with coef[2] = [ a a c f | a a c f ]
1476 * Broadcast ymm5 with [ f f f f | f f f f ]
1477 * Broadcast ymm3 with [ c c c c | c c c c ]
1478 * Broadcast ymm1 with [ a a a a | a a a a ]
1479 */ \
1480 "vbroadcastf128 8(%1), %%ymm1 \n" \
1481 "vpermilps $0xff, %%ymm1, %%ymm5 \n" \
1482 "vpermilps $0xaa, %%ymm1, %%ymm3 \n" \
1483 "vpermilps $0x00, %%ymm1, %%ymm1 \n" \
1484 \
1485 /* If we expand E = [M1] [x0 x2 x4 x6]^t, we get the following
1486 * common expressions:
1487 *
1488 * E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
1489 * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1490 *
1491 * E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1492 * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1493 *
1494 * Afterwards, ymm8-11 will hold the even outputs.
1495 */ \
1496 \
1497 /* ymm11 = (a ymm0 + a ymm4), ymm1 = (a ymm0 - a ymm4) */ \
1498 ROW0( "vmulps %%ymm1, %%ymm0, %%ymm11 \n" ) \
1499 ROW4( "vmulps %%ymm1, %%ymm4, %%ymm4 \n" ) \
1500 ROW0( "vmovaps %%ymm11, %%ymm1 \n" ) \
1501 ROW4( "vaddps %%ymm4, %%ymm11, %%ymm11 \n" ) \
1502 ROW4( "vsubps %%ymm4, %%ymm1, %%ymm1 \n" ) \
1503 \
1504 /* ymm7 = (c ymm2 + f ymm6) */ \
1505 ROW2( "vmulps %%ymm3, %%ymm2, %%ymm7 \n" ) \
1506 ROW6( "vmulps %%ymm5, %%ymm6, %%ymm9 \n" ) \
1507 ROW6( "vaddps %%ymm9, %%ymm7, %%ymm7 \n" ) \
1508 \
1509 /* E_0 = ymm8 = (a ymm0 + a ymm4) + (c ymm2 + f ymm6)
1510 * E_3 = ymm11 = (a ymm0 + a ymm4) - (c ymm2 + f ymm6)
1511 */ \
1512 ROW0( "vmovaps %%ymm11, %%ymm8 \n" ) \
1513 ROW2( "vaddps %%ymm7, %%ymm8, %%ymm8 \n" ) \
1514 ROW2( "vsubps %%ymm7, %%ymm11, %%ymm11 \n" ) \
1515 \
1516 /* ymm7 = (f ymm2 - c ymm6) */ \
1517 ROW2( "vmulps %%ymm5, %%ymm2, %%ymm7 \n" ) \
1518 ROW6( "vmulps %%ymm3, %%ymm6, %%ymm9 \n" ) \
1519 ROW6( "vsubps %%ymm9, %%ymm7, %%ymm7 \n" ) \
1520 \
1521 /* E_1 = ymm9 = (a ymm0 - a ymm4) + (f ymm2 - c ymm6)
1522 * E_2 = ymm10 = (a ymm0 - a ymm4) - (f ymm2 - c ymm6)
1523 */ \
1524 ROW0( "vmovaps %%ymm1, %%ymm9 \n" ) \
1525 ROW0( "vmovaps %%ymm1, %%ymm10 \n" ) \
1526 ROW2( "vaddps %%ymm7, %%ymm1, %%ymm9 \n" ) \
1527 ROW2( "vsubps %%ymm7, %%ymm1, %%ymm10 \n" ) \
1528 \
1529 /* Add the even (ymm8-11) and the odds (ymm12-15),
1530 * placing the results into ymm0-7
1531 */ \
1532 "vaddps %%ymm12, %%ymm8, %%ymm0 \n" \
1533 "vaddps %%ymm13, %%ymm9, %%ymm1 \n" \
1534 "vaddps %%ymm14, %%ymm10, %%ymm2 \n" \
1535 "vaddps %%ymm15, %%ymm11, %%ymm3 \n" \
1536 \
1537 "vsubps %%ymm12, %%ymm8, %%ymm7 \n" \
1538 "vsubps %%ymm13, %%ymm9, %%ymm6 \n" \
1539 "vsubps %%ymm14, %%ymm10, %%ymm5 \n" \
1540 "vsubps %%ymm15, %%ymm11, %%ymm4 \n" \
1541 \
1542 /* Copy out the results from ymm0-7 */ \
1543 "vmovaps %%ymm0, (%0) \n" \
1544 "vmovaps %%ymm1, 32(%0) \n" \
1545 "vmovaps %%ymm2, 64(%0) \n" \
1546 "vmovaps %%ymm3, 96(%0) \n" \
1547 "vmovaps %%ymm4, 128(%0) \n" \
1548 "vmovaps %%ymm5, 160(%0) \n" \
1549 "vmovaps %%ymm6, 192(%0) \n" \
1550 "vmovaps %%ymm7, 224(%0) \n"
1551
1552 /* Output, input, and clobber (OIC) sections of the inline asm */
1553 #define IDCT_AVX_OIC(_IN0) \
1554 : /* Output */ \
1555 : /* Input */ "r"(_IN0), "r"(sAvxCoef) \
1556 : /* Clobber */ "memory", \
1557 "%xmm0", "%xmm1", "%xmm2", "%xmm3", \
1558 "%xmm4", "%xmm5", "%xmm6", "%xmm7", \
1559 "%xmm8", "%xmm9", "%xmm10", "%xmm11",\
1560 "%xmm12", "%xmm13", "%xmm14", "%xmm15"
1561
1562 /* Include vzeroupper for non-AVX builds */
1563 #ifndef __AVX__
1564 #define IDCT_AVX_ASM(_IN0) \
1565 __asm__( \
1566 IDCT_AVX_BODY \
1567 "vzeroupper \n" \
1568 IDCT_AVX_OIC(_IN0) \
1569 );
1570 #else /* __AVX__ */
1571 #define IDCT_AVX_ASM(_IN0) \
1572 __asm__( \
1573 IDCT_AVX_BODY \
1574 IDCT_AVX_OIC(_IN0) \
1575 );
1576 #endif /* __AVX__ */
1577
1578 template <int zeroedRows>
1579 void
dctInverse8x8_avx (float *data)
1581 {
1582 #if defined IMF_HAVE_GCC_INLINEASM_64
1583
1584 /* The column-major version of M1, followed by the
1585 * column-major version of M2:
1586 *
1587 * [ a c a f ] [ b d e g ]
1588 * M1 = [ a f -a -c ] M2 = [ d -g -b -e ]
1589 * [ a -f -a c ] [ e -b g d ]
1590 * [ a -c a -f ] [ g -e d -b ]
1591 */
1592 const float sAvxCoef[32] __attribute__((aligned(32))) = {
1593 3.535536e-01, 3.535536e-01, 3.535536e-01, 3.535536e-01, /* a a a a */
1594 4.619398e-01, 1.913422e-01, -1.913422e-01, -4.619398e-01, /* c f -f -c */
1595 3.535536e-01, -3.535536e-01, -3.535536e-01, 3.535536e-01, /* a -a -a a */
1596 1.913422e-01, -4.619398e-01, 4.619398e-01, -1.913422e-01, /* f -c c -f */
1597
1598 4.903927e-01, 4.157349e-01, 2.777855e-01, 9.754573e-02, /* b d e g */
1599 4.157349e-01, -9.754573e-02, -4.903927e-01, -2.777855e-01, /* d -g -b -e */
1600 2.777855e-01, -4.903927e-01, 9.754573e-02, 4.157349e-01, /* e -b g d */
1601 9.754573e-02, -2.777855e-01, 4.157349e-01, -4.903927e-01 /* g -e d -b */
1602 };
1603
1604 #define ROW0(_X) _X
1605 #define ROW1(_X) _X
1606 #define ROW2(_X) _X
1607 #define ROW3(_X) _X
1608 #define ROW4(_X) _X
1609 #define ROW5(_X) _X
1610 #define ROW6(_X) _X
1611 #define ROW7(_X) _X
1612
1613 if (zeroedRows == 0) {
1614
1615 IDCT_AVX_ASM(data)
1616
1617 } else if (zeroedRows == 1) {
1618
1619 #undef ROW7
1620 #define ROW7(_X)
1621 IDCT_AVX_ASM(data)
1622
1623 } else if (zeroedRows == 2) {
1624
1625 #undef ROW6
1626 #define ROW6(_X)
1627 IDCT_AVX_ASM(data)
1628
1629 } else if (zeroedRows == 3) {
1630
1631 #undef ROW5
1632 #define ROW5(_X)
1633 IDCT_AVX_ASM(data)
1634
1635 } else if (zeroedRows == 4) {
1636
1637 #undef ROW4
1638 #define ROW4(_X)
1639 IDCT_AVX_ASM(data)
1640
1641 } else if (zeroedRows == 5) {
1642
1643 #undef ROW3
1644 #define ROW3(_X)
1645 IDCT_AVX_ASM(data)
1646
1647 } else if (zeroedRows == 6) {
1648
1649 #undef ROW2
1650 #define ROW2(_X)
1651 IDCT_AVX_ASM(data)
1652
1653 } else if (zeroedRows == 7) {
1654
1655 __asm__(
1656
1657 /* ==============================================
1658 * Row 1D DCT
1659 * ----------------------------------------------
1660 */
1661 IDCT_AVX_SETUP_2_ROWS(0, 4, 14, 15, 0, 16, 32, 48)
1662
1663 "vbroadcastf128 (%1), %%ymm8 \n"
1664 "vbroadcastf128 16(%1), %%ymm9 \n"
1665 "vbroadcastf128 32(%1), %%ymm10 \n"
1666 "vbroadcastf128 48(%1), %%ymm11 \n"
1667
1668 /* Stash a vector of [a a a a | a a a a] away in ymm2 */
1669 "vinsertf128 $1, %%xmm8, %%ymm8, %%ymm2 \n"
1670
1671 IDCT_AVX_MMULT_ROWS(%%ymm0)
1672
1673 "vbroadcastf128 64(%1), %%ymm8 \n"
1674 "vbroadcastf128 80(%1), %%ymm9 \n"
1675 "vbroadcastf128 96(%1), %%ymm10 \n"
1676 "vbroadcastf128 112(%1), %%ymm11 \n"
1677
1678 IDCT_AVX_MMULT_ROWS(%%ymm4)
1679
1680 IDCT_AVX_EO_TO_ROW_HALVES(%%ymm0, %%ymm4, %%ymm0, %%ymm12)
1681
1682 "vperm2f128 $0x02, %%ymm0, %%ymm12, %%ymm0 \n"
1683
1684 /* ==============================================
1685 * Column 1D DCT
1686 * ----------------------------------------------
1687 */
1688
        /* DC only, so multiply by a and we're done */
1690 "vmulps %%ymm2, %%ymm0, %%ymm0 \n"
1691
1692 /* Copy out results */
1693 "vmovaps %%ymm0, (%0) \n"
1694 "vmovaps %%ymm0, 32(%0) \n"
1695 "vmovaps %%ymm0, 64(%0) \n"
1696 "vmovaps %%ymm0, 96(%0) \n"
1697 "vmovaps %%ymm0, 128(%0) \n"
1698 "vmovaps %%ymm0, 160(%0) \n"
1699 "vmovaps %%ymm0, 192(%0) \n"
1700 "vmovaps %%ymm0, 224(%0) \n"
1701
1702 #ifndef __AVX__
1703 "vzeroupper \n"
1704 #endif /* __AVX__ */
1705 IDCT_AVX_OIC(data)
1706 );
1707 } else {
1708 assert(false); // Invalid template instance parameter
1709 }
1710 #else /* IMF_HAVE_GCC_INLINEASM_64 */
1711
1712 dctInverse8x8_scalar<zeroedRows>(data);
1713
1714 #endif /* IMF_HAVE_GCC_INLINEASM_64 */
1715 }
1716
1717
1718 //
1719 // Full 8x8 Forward DCT:
1720 //
// Base forward 8x8 DCT implementation. Works on the data in-place.
//
// The implementation is described in Pennebaker + Mitchell,
// section 4.3.2, and illustrated in figure 4-7.
1725 //
1726 // The basic idea is that the 1D DCT math reduces to:
1727 //
1728 // 2*out_0 = c_4 [(s_07 + s_34) + (s_12 + s_56)]
1729 // 2*out_4 = c_4 [(s_07 + s_34) - (s_12 + s_56)]
1730 //
1731 // {2*out_2, 2*out_6} = rot_6 ((d_12 - d_56), (s_07 - s_34))
1732 //
1733 // {2*out_3, 2*out_5} = rot_-3 (d_07 - c_4 (s_12 - s_56),
1734 // d_34 - c_4 (d_12 + d_56))
1735 //
1736 // {2*out_1, 2*out_7} = rot_-1 (d_07 + c_4 (s_12 - s_56),
1737 // -d_34 - c_4 (d_12 + d_56))
1738 //
1739 // where:
1740 //
1741 // c_i = cos(i*pi/16)
1742 // s_i = sin(i*pi/16)
1743 //
1744 // s_ij = in_i + in_j
1745 // d_ij = in_i - in_j
1746 //
1747 // rot_i(x, y) = {c_i*x + s_i*y, -s_i*x + c_i*y}
1748 //
1749 // We'll run the DCT in two passes. First, run the 1D DCT on
1750 // the rows, in-place. Then, run over the columns in-place,
1751 // and be done with it.
1752 //
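//
// As a concrete example, c_4 = cos(4*pi/16) = 1/sqrt(2) ~= 0.70710678,
// which is the value used for the c4 constants in the implementations
// below.
//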
1753
1754 #ifndef IMF_HAVE_SSE2
1755
1756 //
1757 // Default implementation
1758 //
1759
1760 void
dctForward8x8 (float *data)
1762 {
1763 float A0, A1, A2, A3, A4, A5, A6, A7;
1764 float K0, K1, rot_x, rot_y;
1765
1766 float *srcPtr = data;
1767 float *dstPtr = data;
1768
1769 const float c1 = cosf (3.14159f * 1.0f / 16.0f);
1770 const float c2 = cosf (3.14159f * 2.0f / 16.0f);
1771 const float c3 = cosf (3.14159f * 3.0f / 16.0f);
1772 const float c4 = cosf (3.14159f * 4.0f / 16.0f);
1773 const float c5 = cosf (3.14159f * 5.0f / 16.0f);
1774 const float c6 = cosf (3.14159f * 6.0f / 16.0f);
1775 const float c7 = cosf (3.14159f * 7.0f / 16.0f);
1776
1777 const float c1Half = .5f * c1;
1778 const float c2Half = .5f * c2;
1779 const float c3Half = .5f * c3;
1780 const float c5Half = .5f * c5;
1781 const float c6Half = .5f * c6;
1782 const float c7Half = .5f * c7;
1783
1784 //
1785 // First pass - do a 1D DCT over the rows and write the
1786 // results back in place
1787 //
1788
1789 for (int row=0; row<8; ++row)
1790 {
1791 float *srcRowPtr = srcPtr + 8 * row;
1792 float *dstRowPtr = dstPtr + 8 * row;
1793
1794 A0 = srcRowPtr[0] + srcRowPtr[7];
1795 A1 = srcRowPtr[1] + srcRowPtr[2];
1796 A2 = srcRowPtr[1] - srcRowPtr[2];
1797 A3 = srcRowPtr[3] + srcRowPtr[4];
1798 A4 = srcRowPtr[3] - srcRowPtr[4];
1799 A5 = srcRowPtr[5] + srcRowPtr[6];
1800 A6 = srcRowPtr[5] - srcRowPtr[6];
1801 A7 = srcRowPtr[0] - srcRowPtr[7];
1802
1803 K0 = c4 * (A0 + A3);
1804 K1 = c4 * (A1 + A5);
1805
1806 dstRowPtr[0] = .5f * (K0 + K1);
1807 dstRowPtr[4] = .5f * (K0 - K1);
1808
1809 //
1810 // (2*dst2, 2*dst6) = rot 6 (d12 - d56, s07 - s34)
1811 //
1812
1813 rot_x = A2 - A6;
1814 rot_y = A0 - A3;
1815
1816 dstRowPtr[2] = c6Half * rot_x + c2Half * rot_y;
1817 dstRowPtr[6] = c6Half * rot_y - c2Half * rot_x;
1818
1819 //
1820 // K0, K1 are active until after dst[1],dst[7]
1821 // as well as dst[3], dst[5] are computed.
1822 //
1823
1824 K0 = c4 * (A1 - A5);
1825 K1 = -1 * c4 * (A2 + A6);
1826
1827 //
1828 // Two ways to do a rotation:
1829 //
1830 // rot i (x, y) =
1831 // X = c_i*x + s_i*y
1832 // Y = -s_i*x + c_i*y
1833 //
1834 // OR
1835 //
1836 // X = c_i*(x+y) + (s_i-c_i)*y
1837 // Y = c_i*y - (s_i+c_i)*x
1838 //
1839 // the first case has 4 multiplies, but fewer constants,
1840 // while the 2nd case has fewer multiplies but takes more space.
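        // The rotations below use the first form; for example, dst[3]
        // and dst[5] are computed directly as
        // c3Half*rot_x - c5Half*rot_y and c5Half*rot_x + c3Half*rot_y.
        //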

        //
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 )
        //

        rot_x = A7 - K0;
        rot_y = A4 + K1;

        dstRowPtr[3] = c3Half * rot_x - c5Half * rot_y;
        dstRowPtr[5] = c5Half * rot_x + c3Half * rot_y;

        //
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 )
        //

        rot_x = A7 + K0;
        rot_y = K1 - A4;

        //
        // A4 and A7 have now been consumed; none of the A values
        // are needed past this point.
        //

        dstRowPtr[1] = c1Half * rot_x - c7Half * rot_y;
        dstRowPtr[7] = c7Half * rot_x + c1Half * rot_y;
    }

    //
    // Second pass - do the same, but on the columns
    //

    for (int column = 0; column < 8; ++column)
    {

        A0 = srcPtr[     column] + srcPtr[56 + column];
        A7 = srcPtr[     column] - srcPtr[56 + column];

        A1 = srcPtr[ 8 + column] + srcPtr[16 + column];
        A2 = srcPtr[ 8 + column] - srcPtr[16 + column];

        A3 = srcPtr[24 + column] + srcPtr[32 + column];
        A4 = srcPtr[24 + column] - srcPtr[32 + column];

        A5 = srcPtr[40 + column] + srcPtr[48 + column];
        A6 = srcPtr[40 + column] - srcPtr[48 + column];

        K0 = c4 * (A0 + A3);
        K1 = c4 * (A1 + A5);

        dstPtr[   column] = .5f * (K0 + K1);
        dstPtr[32+column] = .5f * (K0 - K1);

        //
        // (2*dst2, 2*dst6) = rot 6 ( d12 - d56, s07 - s34 )
        //

        rot_x = A2 - A6;
        rot_y = A0 - A3;

        dstPtr[16+column] = .5f * (c6 * rot_x + c2 * rot_y);
        dstPtr[48+column] = .5f * (c6 * rot_y - c2 * rot_x);

        //
        // K0 and K1 stay live until dst[1], dst[7], dst[3],
        // and dst[5] have all been computed.
        //

        K0 = c4 * (A1 - A5);
        K1 = -1 * c4 * (A2 + A6);

        //
        // (2*dst3, 2*dst5) = rot -3 ( d07 - K0, d34 + K1 )
        //

        rot_x = A7 - K0;
        rot_y = A4 + K1;

        dstPtr[24+column] = .5f * (c3 * rot_x - c5 * rot_y);
        dstPtr[40+column] = .5f * (c5 * rot_x + c3 * rot_y);

        //
        // (2*dst1, 2*dst7) = rot -1 ( d07 + K0, K1 - d34 )
        //

        rot_x = A7 + K0;
        rot_y = K1 - A4;

        dstPtr[ 8+column] = .5f * (c1 * rot_x - c7 * rot_y);
        dstPtr[56+column] = .5f * (c7 * rot_x + c1 * rot_y);
    }
}

#else /* IMF_HAVE_SSE2 */

//
// SSE2 implementation
//
// Here, we always operate column-wise and transpose in between.
// It might be faster to handle the row pass and the column pass
// with different code paths instead.
//
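//
// The outer loop below runs twice: each iteration applies the 1D DCT
// to the columns of the current layout and then transposes the 8x8
// block, so after both iterations each dimension has been transformed
// once and the data ends up back in its original row-major orientation.
//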

void
dctForward8x8 (float *data)
{
    __m128 *srcVec = (__m128 *)data;
    __m128 a0Vec, a1Vec, a2Vec, a3Vec, a4Vec, a5Vec, a6Vec, a7Vec;
    __m128 k0Vec, k1Vec, rotXVec, rotYVec;
    __m128 transTmp[4], transTmp2[4];

    __m128 c4Vec    = { .70710678f,  .70710678f,  .70710678f,  .70710678f};
    __m128 c4NegVec = {-.70710678f, -.70710678f, -.70710678f, -.70710678f};

    __m128 c1HalfVec = {.490392640f, .490392640f, .490392640f, .490392640f};
    __m128 c2HalfVec = {.461939770f, .461939770f, .461939770f, .461939770f};
    __m128 c3HalfVec = {.415734810f, .415734810f, .415734810f, .415734810f};
    __m128 c5HalfVec = {.277785120f, .277785120f, .277785120f, .277785120f};
    __m128 c6HalfVec = {.191341720f, .191341720f, .191341720f, .191341720f};
    __m128 c7HalfVec = {.097545161f, .097545161f, .097545161f, .097545161f};

    __m128 halfVec = {.5f, .5f, .5f, .5f};

    for (int iter = 0; iter < 2; ++iter)
    {
        //
        // Operate on 4 columns at a time. Viewing the row-major 8x8
        // array as sixteen __m128 vectors, row r occupies srcVec[2*r]
        // (columns 0-3) and srcVec[2*r + 1] (columns 4-7):
        //
        //    row 0:  vec  0,  1
        //    row 1:  vec  2,  3
        //    row 2:  vec  4,  5
        //    row 3:  vec  6,  7
        //    row 4:  vec  8,  9
        //    row 5:  vec 10, 11
        //    row 6:  vec 12, 13
        //    row 7:  vec 14, 15
        //

        for (int pass=0; pass<2; ++pass)
        {
            a0Vec = _mm_add_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
            a1Vec = _mm_add_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
            a3Vec = _mm_add_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
            a5Vec = _mm_add_ps (srcVec[10 + pass], srcVec[12 + pass]);

            a7Vec = _mm_sub_ps (srcVec[ 0 + pass], srcVec[14 + pass]);
            a2Vec = _mm_sub_ps (srcVec[ 2 + pass], srcVec[ 4 + pass]);
            a4Vec = _mm_sub_ps (srcVec[ 6 + pass], srcVec[ 8 + pass]);
            a6Vec = _mm_sub_ps (srcVec[10 + pass], srcVec[12 + pass]);

            //
            // First stage: compute out_0 and out_4
            //

            k0Vec = _mm_add_ps (a0Vec, a3Vec);
            k1Vec = _mm_add_ps (a1Vec, a5Vec);

            k0Vec = _mm_mul_ps (c4Vec, k0Vec);
            k1Vec = _mm_mul_ps (c4Vec, k1Vec);

            srcVec[0 + pass] = _mm_add_ps (k0Vec, k1Vec);
            srcVec[8 + pass] = _mm_sub_ps (k0Vec, k1Vec);

            srcVec[0 + pass] = _mm_mul_ps (srcVec[0 + pass], halfVec);
            srcVec[8 + pass] = _mm_mul_ps (srcVec[8 + pass], halfVec);

            //
            // Second stage: compute out_2 and out_6
            //

            k0Vec = _mm_sub_ps (a2Vec, a6Vec);
            k1Vec = _mm_sub_ps (a0Vec, a3Vec);

            srcVec[ 4 + pass] = _mm_add_ps (_mm_mul_ps (c6HalfVec, k0Vec),
                                            _mm_mul_ps (c2HalfVec, k1Vec));

            srcVec[12 + pass] = _mm_sub_ps (_mm_mul_ps (c6HalfVec, k1Vec),
                                            _mm_mul_ps (c2HalfVec, k0Vec));

            //
            // Precompute K0 and K1 for the remaining stages
            //

            k0Vec = _mm_mul_ps (_mm_sub_ps (a1Vec, a5Vec), c4Vec);
            k1Vec = _mm_mul_ps (_mm_add_ps (a2Vec, a6Vec), c4NegVec);

            //
            // Third stage: compute out_3 and out_5
            //

            rotXVec = _mm_sub_ps (a7Vec, k0Vec);
            rotYVec = _mm_add_ps (a4Vec, k1Vec);

            srcVec[ 6 + pass] = _mm_sub_ps (_mm_mul_ps (c3HalfVec, rotXVec),
                                            _mm_mul_ps (c5HalfVec, rotYVec));

            srcVec[10 + pass] = _mm_add_ps (_mm_mul_ps (c5HalfVec, rotXVec),
                                            _mm_mul_ps (c3HalfVec, rotYVec));

            //
            // Fourth stage: compute out_1 and out_7
            //

            rotXVec = _mm_add_ps (a7Vec, k0Vec);
            rotYVec = _mm_sub_ps (k1Vec, a4Vec);

            srcVec[ 2 + pass] = _mm_sub_ps (_mm_mul_ps (c1HalfVec, rotXVec),
                                            _mm_mul_ps (c7HalfVec, rotYVec));

            srcVec[14 + pass] = _mm_add_ps (_mm_mul_ps (c7HalfVec, rotXVec),
                                            _mm_mul_ps (c1HalfVec, rotYVec));
        }

        //
        // Transpose the matrix, in 4x4 blocks. So, if we have our
        // 8x8 matrix divided into 4x4 blocks:
        //
        //    M0 | M1        M0t | M2t
        //    ---+---  -->   ----+----
        //    M2 | M3        M1t | M3t
        //
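        // Each 4x4 transpose below uses the standard two-step shuffle
        // pattern: masks 0x44 and 0xEE gather {a0,a1,b0,b1} and
        // {a2,a3,b2,b3} from a pair of rows, then masks 0x88 and 0xDD
        // pick out the even and odd elements of those temporaries to
        // form the transposed rows.
        //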

        //
        // M0t, done in place, the first half.
        //

        transTmp[0] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0x44);
        transTmp[1] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0x44);
        transTmp[3] = _mm_shuffle_ps (srcVec[4], srcVec[6], 0xEE);
        transTmp[2] = _mm_shuffle_ps (srcVec[0], srcVec[2], 0xEE);

        //
        // M3t, also done in place, the first half.
        //

        transTmp2[0] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0x44);
        transTmp2[1] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0x44);
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 9], srcVec[11], 0xEE);
        transTmp2[3] = _mm_shuffle_ps (srcVec[13], srcVec[15], 0xEE);

        //
        // M0t, the second half.
        //

        srcVec[0] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
        srcVec[4] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
        srcVec[2] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
        srcVec[6] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);

        //
        // M3t, the second half.
        //

        srcVec[ 9] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
        srcVec[13] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
        srcVec[11] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
        srcVec[15] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);

        //
        // M1 and M2 need to be done at the same time, because we're
        // swapping them.
        //
        // First, the first half of M1t
        //

        transTmp[0] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0x44);
        transTmp[1] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0x44);
        transTmp[2] = _mm_shuffle_ps (srcVec[1], srcVec[3], 0xEE);
        transTmp[3] = _mm_shuffle_ps (srcVec[5], srcVec[7], 0xEE);

        //
        // And the first half of M2t
        //

        transTmp2[0] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0x44);
        transTmp2[1] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0x44);
        transTmp2[2] = _mm_shuffle_ps (srcVec[ 8], srcVec[10], 0xEE);
        transTmp2[3] = _mm_shuffle_ps (srcVec[12], srcVec[14], 0xEE);

        //
        // Second half of M1t
        //

        srcVec[ 8] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0x88);
        srcVec[12] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0x88);
        srcVec[10] = _mm_shuffle_ps (transTmp[0], transTmp[1], 0xDD);
        srcVec[14] = _mm_shuffle_ps (transTmp[2], transTmp[3], 0xDD);

        //
        // Second half of M2t
        //

        srcVec[1] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0x88);
        srcVec[5] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0x88);
        srcVec[3] = _mm_shuffle_ps (transTmp2[0], transTmp2[1], 0xDD);
        srcVec[7] = _mm_shuffle_ps (transTmp2[2], transTmp2[3], 0xDD);
    }
}

#endif /* IMF_HAVE_SSE2 */
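
//
// Usage sketch: dctForward8x8() expects a 32-byte aligned, row-major
// 8x8 block of floats and overwrites it with its DCT coefficients.
// A caller would typically do something like the following (the direct
// use of SimdAlignedBuffer64's _buffer member and the "pixels" source
// array are illustrative only):
//
//     SimdAlignedBuffer64<float> block;
//
//     for (int i = 0; i < 64; ++i)
//         block._buffer[i] = pixels[i];
//
//     dctForward8x8 (block._buffer);
//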

} // anonymous namespace

OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT

#endif /* IMF_DWACOMPRESSORSIMD_H_HAS_BEEN_INCLUDED */