1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2012, Autodesk, Inc.
4 //
5 // All rights reserved.
6 //
7 // Implementation of IIF-specific file format and speed optimizations
8 // provided by Innobec Technologies inc on behalf of Autodesk.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 // * Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 // * Redistributions in binary form must reproduce the above
16 // copyright notice, this list of conditions and the following disclaimer
17 // in the documentation and/or other materials provided with the
18 // distribution.
19 // * Neither the name of Industrial Light & Magic nor the names of
20 // its contributors may be used to endorse or promote products derived
21 // from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
29 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
30 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
31 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
32 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
33 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 ///////////////////////////////////////////////////////////////////////////
36
37 #pragma once
38
39 #ifndef INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
40 #define INCLUDED_IMF_OPTIMIZED_PIXEL_READING_H
41
#include "ImfChannelList.h"
#include "ImfFrameBuffer.h"
#include "ImfSimd.h"
#include "ImfStringVectorAttribute.h"
#include "ImfSystemSpecific.h"

#include <cstdint>
#include <iostream>
48
49 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_ENTER
50
//------------------------------------------------------------------------
// Result of the optimization-eligibility check for a frame buffer:
// whether the optimized (SSE) read path may be used, and the y
// subsampling factor of the channels involved.
//------------------------------------------------------------------------
class OptimizationMode
{
public:

    bool _optimizable;  // true if the optimized read path can be used
    int  _ySampling;    // y subsampling factor of the channels

    // Initialize every member: _ySampling was previously left
    // uninitialized, so reading it before assignment was undefined
    // behavior.  1 (no subsampling) is the neutral default.
    OptimizationMode() : _optimizable(false), _ySampling(1) {}
};
61
62
63 #if IMF_HAVE_SSE2
64
65
66 //------------------------------------------------------------------------
67 // Test for SSE pointer alignemnt
68 //------------------------------------------------------------------------
69 EXR_FORCEINLINE
70 bool
isPointerSSEAligned(const void * EXR_RESTRICT pPointer)71 isPointerSSEAligned (const void* EXR_RESTRICT pPointer)
72 {
73 unsigned long trailingBits = ((unsigned long)pPointer) & 15;
74 return trailingBits == 0;
75 }
76
77 //------------------------------------------------------------------------
78 // Load SSE from address into register
79 //------------------------------------------------------------------------
80 template<bool IS_ALIGNED>
81 EXR_FORCEINLINE
loadSSE(__m128i * & loadAddress)82 __m128i loadSSE (__m128i*& loadAddress)
83 {
84 // throw exception :: this is not accepted
85 return _mm_loadu_si128 (loadAddress);
86 }
87
88 template<>
89 EXR_FORCEINLINE
90 __m128i loadSSE<false> (__m128i*& loadAddress)
91 {
92 return _mm_loadu_si128 (loadAddress);
93 }
94
95 template<>
96 EXR_FORCEINLINE
97 __m128i loadSSE<true> (__m128i*& loadAddress)
98 {
99 return _mm_load_si128 (loadAddress);
100 }
101
102 //------------------------------------------------------------------------
103 // Store SSE from register into address
104 //------------------------------------------------------------------------
105 template<bool IS_ALIGNED>
106 EXR_FORCEINLINE
storeSSE(__m128i * & storeAddress,__m128i & dataToStore)107 void storeSSE (__m128i*& storeAddress, __m128i& dataToStore)
108 {
109
110 }
111
112 template<>
113 EXR_FORCEINLINE
114 void
115 storeSSE<false> (__m128i*& storeAddress, __m128i& dataToStore)
116 {
117 _mm_storeu_si128 (storeAddress, dataToStore);
118 }
119
120 template<>
121 EXR_FORCEINLINE
122 void
123 storeSSE<true> (__m128i*& storeAddress, __m128i& dataToStore)
124 {
125 _mm_stream_si128 (storeAddress, dataToStore);
126 }
127
128
129
130 //------------------------------------------------------------------------
131 //
132 // Write to RGBA
133 //
134 //------------------------------------------------------------------------
135
136 //
137 // Using SSE intrinsics
138 //
//
// Interleave four planar channel buffers (R, G, B, A; 16-bit values,
// 8 per SSE register) into packed RGBA output.  Each iteration reads
// one register per channel (8 values each) and writes four registers
// (8 interleaved RGBA pixels, 4 shorts per pixel).  All six pointers
// are advanced past the processed data, so the caller can follow up
// with a scalar tail loop on the leftovers.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void writeToRGBASSETemplate
    (__m128i*& readPtrSSERed,
     __m128i*& readPtrSSEGreen,
     __m128i*& readPtrSSEBlue,
     __m128i*& readPtrSSEAlpha,
     __m128i*& writePtrSSE,
     const size_t& lPixelsToCopySSE)
{
    for (size_t i = 0; i < lPixelsToCopySSE; ++i)
    {
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);
        __m128i alphaRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEAlpha);

        // Interleave the low 4 values of each channel:
        // redGreen  = R1 G1 R2 G2 R3 G3 R4 G4
        // blueAlpha = B1 A1 B2 A2 B3 A3 B4 A4
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
                                                        alphaRegister);

        // Interleave 32-bit (RG / BA) pairs to obtain full pixels:
        // pixel12 = R1 G1 B1 A1 R2 G2 B2 A2, pixel34 likewise.
        __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                                      blueAlphaRegister);
        __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                                      blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Repeat with the high 4 values of each channel (pixels 5-8).
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister, alphaRegister);

        pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                              blueAlphaRegister);
        pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                              blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Advance to the next 8 values of each channel.
        ++readPtrSSEAlpha;
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
192
193 //
194 // Not using SSE intrinsics. This is still faster than the alternative
195 // because we have multiple read pointers and therefore we are able to
196 // take advantage of data locality for write operations.
197 //
198 EXR_FORCEINLINE
writeToRGBANormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & readPtrAlpha,unsigned short * & writePtr,const size_t & lPixelsToCopy)199 void writeToRGBANormal (unsigned short*& readPtrRed,
200 unsigned short*& readPtrGreen,
201 unsigned short*& readPtrBlue,
202 unsigned short*& readPtrAlpha,
203 unsigned short*& writePtr,
204 const size_t& lPixelsToCopy)
205 {
206 for (size_t i = 0; i < lPixelsToCopy; ++i)
207 {
208 *(writePtr++) = *(readPtrRed++);
209 *(writePtr++) = *(readPtrGreen++);
210 *(writePtr++) = *(readPtrBlue++);
211 *(writePtr++) = *(readPtrAlpha++);
212 }
213 }
214
215 //
216 // Determine which (template) version to use by checking whether pointers
217 // are aligned
218 //
219 EXR_FORCEINLINE
optimizedWriteToRGBA(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & readPtrAlpha,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)220 void optimizedWriteToRGBA (unsigned short*& readPtrRed,
221 unsigned short*& readPtrGreen,
222 unsigned short*& readPtrBlue,
223 unsigned short*& readPtrAlpha,
224 unsigned short*& writePtr,
225 const size_t& pixelsToCopySSE,
226 const size_t& pixelsToCopyNormal)
227 {
228 bool readPtrAreAligned = true;
229
230 readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
231 readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
232 readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
233 readPtrAreAligned &= isPointerSSEAligned(readPtrAlpha);
234
235 bool writePtrIsAligned = isPointerSSEAligned(writePtr);
236
237 if (!readPtrAreAligned && !writePtrIsAligned)
238 {
239 writeToRGBASSETemplate<false, false> ((__m128i*&)readPtrRed,
240 (__m128i*&)readPtrGreen,
241 (__m128i*&)readPtrBlue,
242 (__m128i*&)readPtrAlpha,
243 (__m128i*&)writePtr,
244 pixelsToCopySSE);
245 }
246 else if (!readPtrAreAligned && writePtrIsAligned)
247 {
248 writeToRGBASSETemplate<false, true> ((__m128i*&)readPtrRed,
249 (__m128i*&)readPtrGreen,
250 (__m128i*&)readPtrBlue,
251 (__m128i*&)readPtrAlpha,
252 (__m128i*&)writePtr,
253 pixelsToCopySSE);
254 }
255 else if (readPtrAreAligned && !writePtrIsAligned)
256 {
257 writeToRGBASSETemplate<true, false> ((__m128i*&)readPtrRed,
258 (__m128i*&)readPtrGreen,
259 (__m128i*&)readPtrBlue,
260 (__m128i*&)readPtrAlpha,
261 (__m128i*&)writePtr,
262 pixelsToCopySSE);
263 }
264 else if(readPtrAreAligned && writePtrIsAligned)
265 {
266 writeToRGBASSETemplate<true, true> ((__m128i*&)readPtrRed,
267 (__m128i*&)readPtrGreen,
268 (__m128i*&)readPtrBlue,
269 (__m128i*&)readPtrAlpha,
270 (__m128i*&)writePtr,
271 pixelsToCopySSE);
272 }
273
274 writeToRGBANormal (readPtrRed, readPtrGreen, readPtrBlue, readPtrAlpha,
275 writePtr, pixelsToCopyNormal);
276 }
277
278
279
280 //------------------------------------------------------------------------
281 //
282 // Write to RGBA Fill A
283 //
284 //------------------------------------------------------------------------
285
286 //
287 // Using SSE intrinsics
288 //
//
// Interleave three planar channel buffers (R, G, B; 16-bit values,
// 8 per SSE register) into packed RGBA output, substituting the
// constant alphaFillValue for the missing alpha channel.  Structure is
// identical to writeToRGBASSETemplate except that the alpha register
// is a broadcast constant and no alpha read pointer is advanced.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void
writeToRGBAFillASSETemplate (__m128i*& readPtrSSERed,
                             __m128i*& readPtrSSEGreen,
                             __m128i*& readPtrSSEBlue,
                             const unsigned short& alphaFillValue,
                             __m128i*& writePtrSSE,
                             const size_t& pixelsToCopySSE)
{
    // Broadcast the fill value to all 8 lanes once, outside the loop.
    const __m128i dummyAlphaRegister = _mm_set_epi16 (alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue,
                                                      alphaFillValue);

    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
    {
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);

        // Interleave the low 4 values: RG pairs and B/fill pairs, then
        // combine 32-bit halves into full RGBA pixels 1-4.
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i blueAlphaRegister = _mm_unpacklo_epi16 (blueRegister,
                                                        dummyAlphaRegister);

        __m128i pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                                      blueAlphaRegister);
        __m128i pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                                      blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Repeat with the high 4 values of each channel (pixels 5-8).
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister,
                                                greenRegister);
        blueAlphaRegister = _mm_unpackhi_epi16 (blueRegister,
                                                dummyAlphaRegister);

        pixel12Register = _mm_unpacklo_epi32 (redGreenRegister,
                                              blueAlphaRegister);
        pixel34Register = _mm_unpackhi_epi32 (redGreenRegister,
                                              blueAlphaRegister);

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel12Register);
        ++writePtrSSE;

        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, pixel34Register);
        ++writePtrSSE;

        // Advance to the next 8 values of each channel.
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
351
352 //
353 // Not using SSE intrinsics. This is still faster than the alternative
354 // because we have multiple read pointers and therefore we are able to
355 // take advantage of data locality for write operations.
356 //
357 EXR_FORCEINLINE
358 void
writeToRGBAFillANormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,const unsigned short & alphaFillValue,unsigned short * & writePtr,const size_t & pixelsToCopy)359 writeToRGBAFillANormal (unsigned short*& readPtrRed,
360 unsigned short*& readPtrGreen,
361 unsigned short*& readPtrBlue,
362 const unsigned short& alphaFillValue,
363 unsigned short*& writePtr,
364 const size_t& pixelsToCopy)
365 {
366 for (size_t i = 0; i < pixelsToCopy; ++i)
367 {
368 *(writePtr++) = *(readPtrRed++);
369 *(writePtr++) = *(readPtrGreen++);
370 *(writePtr++) = *(readPtrBlue++);
371 *(writePtr++) = alphaFillValue;
372 }
373 }
374
375 //
376 // Determine which (template) version to use by checking whether pointers
377 // are aligned.
378 //
379 EXR_FORCEINLINE
380 void
optimizedWriteToRGBAFillA(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,const unsigned short & alphaFillValue,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)381 optimizedWriteToRGBAFillA (unsigned short*& readPtrRed,
382 unsigned short*& readPtrGreen,
383 unsigned short*& readPtrBlue,
384 const unsigned short& alphaFillValue,
385 unsigned short*& writePtr,
386 const size_t& pixelsToCopySSE,
387 const size_t& pixelsToCopyNormal)
388 {
389 bool readPtrAreAligned = true;
390
391 readPtrAreAligned &= isPointerSSEAligned (readPtrRed);
392 readPtrAreAligned &= isPointerSSEAligned (readPtrGreen);
393 readPtrAreAligned &= isPointerSSEAligned (readPtrBlue);
394
395 bool writePtrIsAligned = isPointerSSEAligned (writePtr);
396
397 if (!readPtrAreAligned && !writePtrIsAligned)
398 {
399 writeToRGBAFillASSETemplate<false, false> ((__m128i*&)readPtrRed,
400 (__m128i*&)readPtrGreen,
401 (__m128i*&)readPtrBlue,
402 alphaFillValue,
403 (__m128i*&)writePtr,
404 pixelsToCopySSE);
405 }
406 else if (!readPtrAreAligned && writePtrIsAligned)
407 {
408 writeToRGBAFillASSETemplate<false, true> ((__m128i*&)readPtrRed,
409 (__m128i*&)readPtrGreen,
410 (__m128i*&)readPtrBlue,
411 alphaFillValue,
412 (__m128i*&)writePtr,
413 pixelsToCopySSE);
414 }
415 else if (readPtrAreAligned && !writePtrIsAligned)
416 {
417 writeToRGBAFillASSETemplate<true, false> ((__m128i*&)readPtrRed,
418 (__m128i*&)readPtrGreen,
419 (__m128i*&)readPtrBlue,
420 alphaFillValue,
421 (__m128i*&)writePtr,
422 pixelsToCopySSE);
423 }
424 else if (readPtrAreAligned && writePtrIsAligned)
425 {
426 writeToRGBAFillASSETemplate<true, true> ((__m128i*&)readPtrRed,
427 (__m128i*&)readPtrGreen,
428 (__m128i*&)readPtrBlue,
429 alphaFillValue,
430 (__m128i*&)writePtr,
431 pixelsToCopySSE);
432 }
433
434 writeToRGBAFillANormal (readPtrRed,
435 readPtrGreen, readPtrBlue, alphaFillValue,
436 writePtr, pixelsToCopyNormal);
437 }
438
439
440
441 //------------------------------------------------------------------------
442 //
443 // Write to RGB
444 //
445 //------------------------------------------------------------------------
446
447 //
448 // Using SSE intrinsics
449 //
//
// Interleave three planar channel buffers (R, G, B; 16-bit values,
// 8 per SSE register) into packed RGB output.  Each iteration reads
// one register per channel (8 values each) and writes exactly three
// registers (24 shorts = 8 RGB pixels), so pixels straddle register
// boundaries; the shuffle/unpack choreography below builds each output
// register quarter by quarter.  All pointers are advanced past the
// processed data for a subsequent scalar tail loop.
//
template<bool READ_PTR_ALIGNED, bool WRITE_PTR_ALIGNED>
EXR_FORCEINLINE
void
writeToRGBSSETemplate (__m128i*& readPtrSSERed,
                       __m128i*& readPtrSSEGreen,
                       __m128i*& readPtrSSEBlue,
                       __m128i*& writePtrSSE,
                       const size_t& pixelsToCopySSE)
{

    for (size_t pixelCounter = 0; pixelCounter < pixelsToCopySSE; ++pixelCounter)
    {
        //
        // Need to shuffle and unpack pointers to obtain my first register
        // We must save 8 pixels at a time, so we must have the following three registers at the end:
        // 1) R1 G1 B1 R2 G2 B2 R3 G3
        // 2) B3 R4 G4 B4 R5 G5 B5 R6
        // 3) G6 B6 R7 G7 B7 R8 G8 B8
        //
        __m128i redRegister   = loadSSE<READ_PTR_ALIGNED> (readPtrSSERed);
        __m128i greenRegister = loadSSE<READ_PTR_ALIGNED> (readPtrSSEGreen);
        __m128i blueRegister  = loadSSE<READ_PTR_ALIGNED> (readPtrSSEBlue);

        //
        // First register: R1 G1 B1 R2 G2 B2 R3 G3
        // Construct 2 registers and then unpack them to obtain our final result:
        //
        // Pairwise interleaves of the low halves (pixels 1-4) used as
        // raw material for the shuffles below.
        __m128i redGreenRegister  = _mm_unpacklo_epi16 (redRegister,
                                                        greenRegister);
        __m128i redBlueRegister   = _mm_unpacklo_epi16 (redRegister,
                                                        blueRegister);
        __m128i greenBlueRegister = _mm_unpacklo_epi16 (greenRegister,
                                                        blueRegister);

        // Left Part (R1 G1 B1 R2)
        __m128i quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
                                                    _MM_SHUFFLE(3,0,2,1));
        __m128i halfLeft     = _mm_unpacklo_epi32 (redGreenRegister,
                                                   quarterRight);

        // Right Part (G2 B2 R3 G3)
        __m128i quarterLeft = _mm_shuffle_epi32 (greenBlueRegister,
                                                 _MM_SHUFFLE(3,2,0,1));
        quarterRight        = _mm_shuffle_epi32 (redGreenRegister,
                                                 _MM_SHUFFLE(3,0,1,2));
        __m128i halfRight   = _mm_unpacklo_epi32 (quarterLeft, quarterRight);

        __m128i fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Second register: B3 R4 G4 B4 R5 G5 B5 R6
        //

        // Left Part (B3, R4, G4, B4)
        quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
                                            _MM_SHUFFLE(0, 3, 2, 1));
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
                                            _MM_SHUFFLE(1, 0, 3, 2));
        halfLeft     = _mm_unpackhi_epi32 (quarterLeft, quarterRight);

        // Update the registers
        // Switch to the high halves (pixels 5-8) of the channel data.
        redGreenRegister  = _mm_unpackhi_epi16 (redRegister, greenRegister);
        redBlueRegister   = _mm_unpackhi_epi16 (redRegister, blueRegister);
        greenBlueRegister = _mm_unpackhi_epi16 (greenRegister, blueRegister);

        // Right Part (R5 G5 B5 R6)
        quarterRight = _mm_shufflelo_epi16 (redBlueRegister,
                                            _MM_SHUFFLE(3,0,2,1));
        halfRight    = _mm_unpacklo_epi32 (redGreenRegister, quarterRight);

        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Third register: G6 B6 R7 G7 B7 R8 G8 B8
        //

        // Left part (G6 B6 R7 G7)
        quarterLeft  = _mm_shuffle_epi32 (greenBlueRegister,
                                          _MM_SHUFFLE(3,2,0,1));
        quarterRight = _mm_shuffle_epi32 (redGreenRegister,
                                          _MM_SHUFFLE(3,0,1,2));
        halfLeft     = _mm_unpacklo_epi32 (quarterLeft, quarterRight);

        // Right part (B7 R8 G8 B8)
        quarterLeft  = _mm_shufflehi_epi16 (redBlueRegister,
                                            _MM_SHUFFLE(0, 3, 2, 1));
        quarterRight = _mm_shufflehi_epi16 (greenBlueRegister,
                                            _MM_SHUFFLE(1, 0, 3, 2));
        halfRight    = _mm_unpackhi_epi32 (quarterLeft, quarterRight);

        fullRegister = _mm_unpacklo_epi64 (halfLeft, halfRight);
        storeSSE<WRITE_PTR_ALIGNED> (writePtrSSE, fullRegister);
        ++writePtrSSE;

        //
        // Increment read pointers
        //
        ++readPtrSSEBlue;
        ++readPtrSSEGreen;
        ++readPtrSSERed;
    }
}
556
557 //
558 // Not using SSE intrinsics. This is still faster than the alternative
559 // because we have multiple read pointers and therefore we are able to
560 // take advantage of data locality for write operations.
561 //
562 EXR_FORCEINLINE
563 void
writeToRGBNormal(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & writePtr,const size_t & pixelsToCopy)564 writeToRGBNormal (unsigned short*& readPtrRed,
565 unsigned short*& readPtrGreen,
566 unsigned short*& readPtrBlue,
567 unsigned short*& writePtr,
568 const size_t& pixelsToCopy)
569 {
570 for (size_t i = 0; i < pixelsToCopy; ++i)
571 {
572 *(writePtr++) = *(readPtrRed++);
573 *(writePtr++) = *(readPtrGreen++);
574 *(writePtr++) = *(readPtrBlue++);
575 }
576 }
577
578 //
579 // Determine which (template) version to use by checking whether pointers
580 // are aligned
581 //
582 EXR_FORCEINLINE
optimizedWriteToRGB(unsigned short * & readPtrRed,unsigned short * & readPtrGreen,unsigned short * & readPtrBlue,unsigned short * & writePtr,const size_t & pixelsToCopySSE,const size_t & pixelsToCopyNormal)583 void optimizedWriteToRGB (unsigned short*& readPtrRed,
584 unsigned short*& readPtrGreen,
585 unsigned short*& readPtrBlue,
586 unsigned short*& writePtr,
587 const size_t& pixelsToCopySSE,
588 const size_t& pixelsToCopyNormal)
589 {
590 bool readPtrAreAligned = true;
591
592 readPtrAreAligned &= isPointerSSEAligned(readPtrRed);
593 readPtrAreAligned &= isPointerSSEAligned(readPtrGreen);
594 readPtrAreAligned &= isPointerSSEAligned(readPtrBlue);
595
596 bool writePtrIsAligned = isPointerSSEAligned(writePtr);
597
598 if (!readPtrAreAligned && !writePtrIsAligned)
599 {
600 writeToRGBSSETemplate<false, false> ((__m128i*&)readPtrRed,
601 (__m128i*&)readPtrGreen,
602 (__m128i*&)readPtrBlue,
603 (__m128i*&)writePtr,
604 pixelsToCopySSE);
605 }
606 else if (!readPtrAreAligned && writePtrIsAligned)
607 {
608 writeToRGBSSETemplate<false, true> ((__m128i*&)readPtrRed,
609 (__m128i*&)readPtrGreen,
610 (__m128i*&)readPtrBlue,
611 (__m128i*&)writePtr,
612 pixelsToCopySSE);
613 }
614 else if (readPtrAreAligned && !writePtrIsAligned)
615 {
616 writeToRGBSSETemplate<true, false> ((__m128i*&)readPtrRed,
617 (__m128i*&)readPtrGreen,
618 (__m128i*&)readPtrBlue,
619 (__m128i*&)writePtr,
620 pixelsToCopySSE);
621 }
622 else if (readPtrAreAligned && writePtrIsAligned)
623 {
624 writeToRGBSSETemplate<true, true> ((__m128i*&)readPtrRed,
625 (__m128i*&)readPtrGreen,
626 (__m128i*&)readPtrBlue,
627 (__m128i*&)writePtr,
628 pixelsToCopySSE);
629 }
630
631
632 writeToRGBNormal (readPtrRed, readPtrGreen, readPtrBlue,
633 writePtr, pixelsToCopyNormal);
634 }
635
636
637
638
639 #else // ! defined IMF_HAVE_SSE2
640
641 #endif // defined IMF_HAVE_SSE2
642
643
644 OPENEXR_IMF_INTERNAL_NAMESPACE_HEADER_EXIT
645
646 #endif
647