1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2012, Autodesk, Inc.
4 //
5 // All rights reserved.
6 //
7 // Implementation of IIF-specific file format and speed optimizations
8 // provided by Innobec Technologies inc on behalf of Autodesk.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 // *       Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 // *       Redistributions in binary form must reproduce the above
16 // copyright notice, this list of conditions and the following disclaimer
17 // in the documentation and/or other materials provided with the
18 // distribution.
19 // *       Neither the name of Industrial Light & Magic nor the names of
20 // its contributors may be used to endorse or promote products derived
21 // from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 ///////////////////////////////////////////////////////////////////////////
36 
37 #pragma once
38 
39 #ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
40 #define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
41 
#include "ImfSimd.h"
#include "ImfSystemSpecific.h"
#include <iostream>
#include <stdint.h>
#include "ImfChannelList.h"
#include "ImfFrameBuffer.h"
#include "ImfStringVectorAttribute.h"
48 
49 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
50 
//
// Small settings holder for the optimized pixel-reading path.
//
// NOTE(review): the code that fills in and consumes these fields is not
// part of this header; the field descriptions below are inferred from
// their names and should be confirmed against the callers.
//
class OptimizationMode
{
public:

    // Presumably true when the optimized code path may be used.
    // Always false right after construction.
    bool _optimizable;

    // Presumably the vertical sampling rate of the data being read.
    // Starts at 1 so that the member never holds an indeterminate value.
    int _ySampling;

    OptimizationMode() : _optimizable(false), _ySampling(1) {}

};
61 
62 
63 #if IMF_HAVE_SSE2
64 
65 
66 //------------------------------------------------------------------------
67 // Test for SSE pointer alignemnt
68 //------------------------------------------------------------------------
69 EXR_FORCEINLINE
70 bool
isPointerSSEAligned(const void * EXR_RESTRICT pPointer)71 isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
72 {
73     unsigned long trailingBits = ((unsigned long)pPointer) & 15;
74     return trailingBits == 0;
75 }
76 
77 //------------------------------------------------------------------------
78 // Load SSE from address into register
79 //------------------------------------------------------------------------
80 template<bool IS_ALIGNED>
81 EXR_FORCEINLINE
loadSSE(__m128i * & loadAddress)82 __m128i loadSSE (__m128i*& loadAddress)
83 {
84     // throw exception :: this is not accepted
85     return _mm_loadu_si128 (loadAddress);
86 }
87 
88 template<>
89 EXR_FORCEINLINE
90 __m128i loadSSE<false> (__m128i*& loadAddress)
91 {
92     return _mm_loadu_si128 (loadAddress);
93 }
94 
95 template<>
96 EXR_FORCEINLINE
97 __m128i loadSSE<true> (__m128i*& loadAddress)
98 {
99     return _mm_load_si128 (loadAddress);
100 }
101 
102 //------------------------------------------------------------------------
103 // Store SSE from register into address
104 //------------------------------------------------------------------------
105 template<bool IS_ALIGNED>
106 EXR_FORCEINLINE
storeSSE(__m128i * & storeAddress,__m128i & dataToStore)107 void storeSSE (__m128i*& storeAddress, __m128i& dataToStore)
108 {
109 
110 }
111 
112 template<>
113 EXR_FORCEINLINE
114 void
115 storeSSE<false> (__m128i*& storeAddress, __m128i& dataToStore)
116 {
117     _mm_storeu_si128 (storeAddress, dataToStore);
118 }
119 
120 template<>
121 EXR_FORCEINLINE
122 void
123 storeSSE<true> (__m128i*& storeAddress, __m128i& dataToStore)
124 {
125     _mm_stream_si128 (storeAddress, dataToStore);
126 }
127 
128 
129 
130 //------------------------------------------------------------------------
131 //
132 // Write to RGBA
133 //
134 //------------------------------------------------------------------------
135 
136 //
137 // Using SSE intrinsics
138 //
139 template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
140 EXR_FORCEINLINE
writeToRGBASSETemplate(__m128i * & readPtrSSERed,__m128i * & readPtrSSEGreen,__m128i * & readPtrSSEBlue,__m128i * & readPtrSSEAlpha,__m128i * & writePtrSSE,const size_t & lPixelsToCopySSE)141 void writeToRGBASSETemplate
142     (__m128i*& readPtrSSERed,
143      __m128i*& readPtrSSEGreen,
144      __m128i*& readPtrSSEBlue,
145      __m128i*& readPtrSSEAlpha,
146      __m128i*& writePtrSSE,
147      const size_t& lPixelsToCopySSE)
148 {
149     for (size_t i = 0; i < lPixelsToCopySSE; ++i)
150     {
151         __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
152         __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
153         __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
154         __m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha);
155 
156         __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
157                                                         greenRegister);
158         __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
159                                                         alphaRegister);
160 
161         __m128i pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
162                                                         blueAlphaRegister);
163         __m128i pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
164                                                         blueAlphaRegister);
165 
166         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
167         ++writePtrSSE;
168 
169         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
170         ++writePtrSSE;
171 
172         redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
173         blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister);
174 
175         pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
176                                                 blueAlphaRegister);
177         pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
178                                                 blueAlphaRegister);
179 
180         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
181         ++writePtrSSE;
182 
183         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
184         ++writePtrSSE;
185 
186         ++readPtrSSEAlpha;
187         ++readPtrSSEBlue;
188         ++readPtrSSEGreen;
189         ++readPtrSSERed;
190     }
191 }
192 
193 //
194 // Not using SSE intrinsics.  This is still faster than the alternative
195 // because we have multiple read pointers and therefore we are able to
196 // take advantage of data locality for write operations.
197 //
198 EXR_FORCEINLINE
writeToRGBANormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & readPtrAlpha,unsigned short * & writePtr,const size_t & lPixelsToCopy)199 void writeToRGBANormal (unsigned short*& readPtrRed,
200                         unsigned short*& readPtrGreen,
201                         unsigned short*& readPtrBlue,
202                         unsigned short*& readPtrAlpha,
203                         unsigned short*& writePtr,
204                         const size_t& lPixelsToCopy)
205 {
206     for (size_t i = 0; i < lPixelsToCopy; ++i)
207     {
208         *(writePtr++) = *(readPtrRed++);
209         *(writePtr++) = *(readPtrGreen++);
210         *(writePtr++) = *(readPtrBlue++);
211         *(writePtr++) = *(readPtrAlpha++);
212     }
213 }
214 
215 //
216 // Determine which (template) version to use by checking whether pointers
217 // are aligned
218 //
219 EXR_FORCEINLINE
optimizedWriteToRGBA(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & readPtrAlpha,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)220 void optimizedWriteToRGBA (unsigned short*& readPtrRed,
221                            unsigned short*& readPtrGreen,
222                            unsigned short*& readPtrBlue,
223                            unsigned short*& readPtrAlpha,
224                            unsigned short*& writePtr,
225                            const size_t& pixelsToCopySSE,
226                            const size_t& pixelsToCopyNormal)
227 {
228     bool readPtrAreAligned = true;
229 
230     readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
231     readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
232     readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
233     readPtrAreAligned &= isPointerSSEAligned(readPtrAlpha);
234 
235     bool writePtrIsAligned = isPointerSSEAligned(writePtr);
236 
237     if (!readPtrAreAligned && !writePtrIsAligned)
238     {
239         writeToRGBASSETemplate<false, false> ((__m128i*&)readPtrRed,
240                                               (__m128i*&)readPtrGreen,
241                                               (__m128i*&)readPtrBlue,
242                                               (__m128i*&)readPtrAlpha,
243                                               (__m128i*&)writePtr,
244                                               pixelsToCopySSE);
245     }
246     else if (!readPtrAreAligned && writePtrIsAligned)
247     {
248         writeToRGBASSETemplate<false, true> ((__m128i*&)readPtrRed,
249                                              (__m128i*&)readPtrGreen,
250                                              (__m128i*&)readPtrBlue,
251                                              (__m128i*&)readPtrAlpha,
252                                              (__m128i*&)writePtr,
253                                              pixelsToCopySSE);
254     }
255     else if (readPtrAreAligned && !writePtrIsAligned)
256     {
257         writeToRGBASSETemplate<true, false> ((__m128i*&)readPtrRed,
258                                              (__m128i*&)readPtrGreen,
259                                              (__m128i*&)readPtrBlue,
260                                              (__m128i*&)readPtrAlpha,
261                                              (__m128i*&)writePtr,
262                                              pixelsToCopySSE);
263     }
264     else if(readPtrAreAligned && writePtrIsAligned)
265     {
266         writeToRGBASSETemplate<true, true> ((__m128i*&)readPtrRed,
267                                             (__m128i*&)readPtrGreen,
268                                             (__m128i*&)readPtrBlue,
269                                             (__m128i*&)readPtrAlpha,
270                                             (__m128i*&)writePtr,
271                                             pixelsToCopySSE);
272     }
273 
274     writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha,
275                        writePtr, pixelsToCopyNormal);
276 }
277 
278 
279 
280 //------------------------------------------------------------------------
281 //
282 // Write to RGBA Fill A
283 //
284 //------------------------------------------------------------------------
285 
286 //
287 // Using SSE intrinsics
288 //
289 template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
290 EXR_FORCEINLINE
291 void
writeToRGBAFillASSETemplate(__m128i * & readPtrSSERed,__m128i * & readPtrSSEGreen,__m128i * & readPtrSSEBlue,const unsigned short & alphaFillValue,__m128i * & writePtrSSE,const size_t & pixelsToCopySSE)292 writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed,
293                              __m128i*& readPtrSSEGreen,
294                              __m128i*& readPtrSSEBlue,
295                              const unsigned short& alphaFillValue,
296                              __m128i*& writePtrSSE,
297                              const size_t& pixelsToCopySSE)
298 {
299     const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue,
300                                                       alphaFillValue,
301                                                       alphaFillValue,
302                                                       alphaFillValue,
303                                                       alphaFillValue,
304                                                       alphaFillValue,
305                                                       alphaFillValue,
306                                                       alphaFillValue);
307 
308     for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
309     {
310         __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
311         __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
312         __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
313 
314         __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
315                                                         greenRegister);
316         __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
317                                                         dummyAlphaRegister);
318 
319         __m128i pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
320                                                         blueAlphaRegister);
321         __m128i pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
322                                                         blueAlphaRegister);
323 
324         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
325         ++writePtrSSE;
326 
327         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
328         ++writePtrSSE;
329 
330         redGreenRegister  = _mm_unpackhi_epi16 (redRegister,
331                                                 greenRegister);
332         blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister,
333                                                 dummyAlphaRegister);
334 
335         pixel12Register   = _mm_unpacklo_epi32 (redGreenRegister,
336                                                 blueAlphaRegister);
337         pixel34Register   = _mm_unpackhi_epi32 (redGreenRegister,
338                                                 blueAlphaRegister);
339 
340         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
341         ++writePtrSSE;
342 
343         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
344         ++writePtrSSE;
345 
346         ++readPtrSSEBlue;
347         ++readPtrSSEGreen;
348         ++readPtrSSERed;
349     }
350 }
351 
352 //
353 // Not using SSE intrinsics.  This is still faster than the alternative
354 // because we have multiple read pointers and therefore we are able to
355 // take advantage of data locality for write operations.
356 //
357 EXR_FORCEINLINE
358 void
writeToRGBAFillANormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,const unsigned short & alphaFillValue,unsigned short * & writePtr,const size_t & pixelsToCopy)359 writeToRGBAFillANormal (unsigned short*& readPtrRed,
360                         unsigned short*& readPtrGreen,
361                         unsigned short*& readPtrBlue,
362                         const unsigned short& alphaFillValue,
363                         unsigned short*& writePtr,
364                         const size_t& pixelsToCopy)
365 {
366     for (size_t i = 0; i < pixelsToCopy; ++i)
367     {
368         *(writePtr++) = *(readPtrRed++);
369         *(writePtr++) = *(readPtrGreen++);
370         *(writePtr++) = *(readPtrBlue++);
371         *(writePtr++) = alphaFillValue;
372     }
373 }
374 
375 //
376 // Determine which (template) version to use by checking whether pointers
377 // are aligned.
378 //
379 EXR_FORCEINLINE
380 void
optimizedWriteToRGBAFillA(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,const unsigned short & alphaFillValue,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)381 optimizedWriteToRGBAFillA (unsigned short*& readPtrRed,
382                            unsigned short*& readPtrGreen,
383                            unsigned short*& readPtrBlue,
384                            const unsigned short& alphaFillValue,
385                            unsigned short*& writePtr,
386                            const size_t& pixelsToCopySSE,
387                            const size_t& pixelsToCopyNormal)
388 {
389     bool readPtrAreAligned = true;
390 
391     readPtrAreAligned &= isPointerSSEAligned (readPtrRed);
392     readPtrAreAligned &= isPointerSSEAligned (readPtrGreen);
393     readPtrAreAligned &= isPointerSSEAligned (readPtrBlue);
394 
395     bool writePtrIsAligned = isPointerSSEAligned (writePtr);
396 
397     if (!readPtrAreAligned && !writePtrIsAligned)
398     {
399         writeToRGBAFillASSETemplate<false, false> ((__m128i*&)readPtrRed,
400                                                    (__m128i*&)readPtrGreen,
401                                                    (__m128i*&)readPtrBlue,
402                                                    alphaFillValue,
403                                                    (__m128i*&)writePtr,
404                                                    pixelsToCopySSE);
405     }
406     else if (!readPtrAreAligned && writePtrIsAligned)
407     {
408         writeToRGBAFillASSETemplate<false, true> ((__m128i*&)readPtrRed,
409                                                   (__m128i*&)readPtrGreen,
410                                                   (__m128i*&)readPtrBlue,
411                                                   alphaFillValue,
412                                                   (__m128i*&)writePtr,
413                                                   pixelsToCopySSE);
414     }
415     else if (readPtrAreAligned && !writePtrIsAligned)
416     {
417         writeToRGBAFillASSETemplate<true, false> ((__m128i*&)readPtrRed,
418                                                   (__m128i*&)readPtrGreen,
419                                                   (__m128i*&)readPtrBlue,
420                                                   alphaFillValue,
421                                                   (__m128i*&)writePtr,
422                                                   pixelsToCopySSE);
423     }
424     else if (readPtrAreAligned && writePtrIsAligned)
425     {
426         writeToRGBAFillASSETemplate<true, true> ((__m128i*&)readPtrRed,
427                                                  (__m128i*&)readPtrGreen,
428                                                  (__m128i*&)readPtrBlue,
429                                                  alphaFillValue,
430                                                  (__m128i*&)writePtr,
431                                                  pixelsToCopySSE);
432     }
433 
434     writeToRGBAFillANormal (readPtrRed,
435                             readPtrGreen, readPtrBlue, alphaFillValue,
436                             writePtr, pixelsToCopyNormal);
437 }
438 
439 
440 
441 //------------------------------------------------------------------------
442 //
443 // Write to RGB
444 //
445 //------------------------------------------------------------------------
446 
447 //
448 // Using SSE intrinsics
449 //
450 template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
451 EXR_FORCEINLINE
452 void
writeToRGBSSETemplate(__m128i * & readPtrSSERed,__m128i * & readPtrSSEGreen,__m128i * & readPtrSSEBlue,__m128i * & writePtrSSE,const size_t & pixelsToCopySSE)453 writeToRGBSSETemplate (__m128i*& readPtrSSERed,
454                        __m128i*& readPtrSSEGreen,
455                        __m128i*& readPtrSSEBlue,
456                        __m128i*& writePtrSSE,
457                        const size_t& pixelsToCopySSE)
458 {
459 
460     for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
461     {
462         //
463         // Need to shuffle and unpack pointers to obtain my first register
464         // We must save 8 pixels at a time, so we must have the following three registers at the end:
465         // 1) R1 G1 B1 R2 G2 B2 R3 G3
466         // 2) B3 R4 G4 B4 R5 G5 B5 R6
467         // 3) G6 B6 R7 G7 B7 R8 G8 B8
468         //
469         __m128i redRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
470         __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
471         __m128i blueRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
472 
473         //
474         // First register: R1 G1 B1 R2 G2 B2 R3 G3
475         // Construct 2 registers and then unpack them to obtain our final result:
476         //
477         __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
478                                                         greenRegister);
479         __m128i redBlueRegister   = _mm_unpacklo_epi16 (redRegister,
480                                                         blueRegister);
481         __m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister,
482                                                         blueRegister);
483 
484         // Left Part (R1 G1 B1 R2)
485         __m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
486                                                     _MM_SHUFFLE(3,0,2,1));
487         __m128i halfLeft     = _mm_unpacklo_epi32 (redGreenRegister,
488                                                    quarterRight);
489 
490         // Right Part (G2 B2 R3 G3)
491         __m128i quarterLeft  = _mm_shuffle_epi32 (greenBlueRegister,
492                                                  _MM_SHUFFLE(3,2,0,1));
493         quarterRight         = _mm_shuffle_epi32 (redGreenRegister,
494                                                  _MM_SHUFFLE(3,0,1,2));
495         __m128i halfRight    = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
496 
497         __m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
498         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
499         ++writePtrSSE;
500 
501         //
502         // Second register: B3 R4 G4 B4 R5 G5 B5 R6
503         //
504 
505         // Left Part (B3, R4, G4, B4)
506         quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
507                                             _MM_SHUFFLE(0, 3, 2, 1));
508         quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
509                                             _MM_SHUFFLE(1, 0, 3, 2));
510         halfLeft     = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
511 
512         // Update the registers
513         redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
514         redBlueRegister   = _mm_unpackhi_epi16 (redRegister, blueRegister);
515         greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister);
516 
517         // Right Part (R5 G5 B5 R6)
518         quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
519                                             _MM_SHUFFLE(3,0,2,1));
520         halfRight    = _mm_unpacklo_epi32 (redGreenRegister, quarterRight);
521 
522         fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
523         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
524         ++writePtrSSE;
525 
526         //
527         // Third register: G6 B6 R7 G7 B7 R8 G8 B8
528         //
529 
530         // Left part (G6 B6 R7 G7)
531         quarterLeft  = _mm_shuffle_epi32 (greenBlueRegister,
532                                           _MM_SHUFFLE(3,2,0,1));
533         quarterRight = _mm_shuffle_epi32 (redGreenRegister,
534                                           _MM_SHUFFLE(3,0,1,2));
535         halfLeft     = _mm_unpacklo_epi32 (quarterLeft, quarterRight);
536 
537         // Right part (B7 R8 G8 B8)
538         quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
539                                             _MM_SHUFFLE(0, 3, 2, 1));
540         quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
541                                             _MM_SHUFFLE(1, 0, 3, 2));
542         halfRight    = _mm_unpackhi_epi32 (quarterLeft, quarterRight);
543 
544         fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
545         storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
546         ++writePtrSSE;
547 
548         //
549         // Increment read pointers
550         //
551         ++readPtrSSEBlue;
552         ++readPtrSSEGreen;
553         ++readPtrSSERed;
554     }
555 }
556 
557 //
558 // Not using SSE intrinsics.  This is still faster than the alternative
559 // because we have multiple read pointers and therefore we are able to
560 // take advantage of data locality for write operations.
561 //
562 EXR_FORCEINLINE
563 void
writeToRGBNormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & writePtr,const size_t & pixelsToCopy)564 writeToRGBNormal (unsigned short*& readPtrRed,
565                   unsigned short*& readPtrGreen,
566                   unsigned short*& readPtrBlue,
567                   unsigned short*& writePtr,
568                   const size_t& pixelsToCopy)
569 {
570     for (size_t i = 0; i < pixelsToCopy; ++i)
571     {
572         *(writePtr++) = *(readPtrRed++);
573         *(writePtr++) = *(readPtrGreen++);
574         *(writePtr++) = *(readPtrBlue++);
575     }
576 }
577 
578 //
579 // Determine which (template) version to use by checking whether pointers
580 // are aligned
581 //
582 EXR_FORCEINLINE
optimizedWriteToRGB(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)583 void optimizedWriteToRGB (unsigned short*& readPtrRed,
584                           unsigned short*& readPtrGreen,
585                           unsigned short*& readPtrBlue,
586                           unsigned short*& writePtr,
587                           const size_t& pixelsToCopySSE,
588                           const size_t& pixelsToCopyNormal)
589 {
590     bool readPtrAreAligned = true;
591 
592     readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
593     readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
594     readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
595 
596     bool writePtrIsAligned = isPointerSSEAligned(writePtr);
597 
598     if (!readPtrAreAligned && !writePtrIsAligned)
599     {
600         writeToRGBSSETemplate<false, false> ((__m128i*&)readPtrRed,
601                                              (__m128i*&)readPtrGreen,
602                                              (__m128i*&)readPtrBlue,
603                                              (__m128i*&)writePtr,
604                                              pixelsToCopySSE);
605     }
606     else if (!readPtrAreAligned && writePtrIsAligned)
607     {
608         writeToRGBSSETemplate<false, true> ((__m128i*&)readPtrRed,
609                                             (__m128i*&)readPtrGreen,
610                                             (__m128i*&)readPtrBlue,
611                                             (__m128i*&)writePtr,
612                                             pixelsToCopySSE);
613     }
614     else if (readPtrAreAligned && !writePtrIsAligned)
615     {
616         writeToRGBSSETemplate<true, false> ((__m128i*&)readPtrRed,
617                                             (__m128i*&)readPtrGreen,
618                                             (__m128i*&)readPtrBlue,
619                                             (__m128i*&)writePtr,
620                                             pixelsToCopySSE);
621     }
622     else if (readPtrAreAligned && writePtrIsAligned)
623     {
624         writeToRGBSSETemplate<true, true> ((__m128i*&)readPtrRed,
625                                            (__m128i*&)readPtrGreen,
626                                            (__m128i*&)readPtrBlue,
627                                            (__m128i*&)writePtr,
628                                            pixelsToCopySSE);
629     }
630 
631 
632     writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue,
633                       writePtr, pixelsToCopyNormal);
634 }
635 
636 
637 
638 
639 #else // ! defined IMF_HAVE_SSE2
640 
641 #endif // defined IMF_HAVE_SSE2
642 
643 
644 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
645 
646 #endif
647