1 /*
2 ** SPDX-License-Identifier: BSD-3-Clause
3 ** Copyright Contributors to the OpenEXR Project.
4 */
5 
6 #include "internal_coding.h"
7 #include "internal_xdr.h"
8 
9 #include "openexr_attr.h"
10 
11 #include <string.h>
12 #include <stdbool.h>
13 
14 #if defined(__x86_64__) || defined(_M_X64)
15 #    ifndef _WIN32
16 #        include <cpuid.h>
17 #    endif
18 #endif
19 
20 /**************************************/
21 
22 #ifndef __F16C__
/* Convert four consecutive half values at src into four floats at out. */
static inline void
half_to_float4 (float* out, const uint16_t* src)
{
    for (int i = 0; i < 4; ++i)
        out[i] = half_to_float (src[i]);
}
31 
/* Convert eight consecutive half values, handled as two groups of four. */
static inline void
half_to_float8 (float* out, const uint16_t* src)
{
    const uint16_t* upper_src = src + 4;
    float*          upper_out = out + 4;

    half_to_float4 (out, src);
    half_to_float4 (upper_out, upper_src);
}
38 #endif
39 
40 #if (defined(__x86_64__) || defined(_M_X64)) &&                                \
41     (defined(__F16C__) || defined(__GNUC__) || defined(__clang__))
42 
43 #    if defined(__F16C__)
44 static inline void
half_to_float_buffer(float * out,const uint16_t * in,int w)45 half_to_float_buffer (float* out, const uint16_t* in, int w)
46 #    elif defined(__GNUC__) || defined(__clang__)
47 __attribute__ ((target ("f16c"))) static void
48 half_to_float_buffer_f16c (float* out, const uint16_t* in, int w)
49 #    endif
50 {
51     while (w >= 8)
52     {
53         _mm256_storeu_ps (
54             out, _mm256_cvtph_ps (_mm_loadu_si128 ((const __m128i*) in)));
55         out += 8;
56         in += 8;
57         w -= 8;
58     }
59     // gcc < 9 does not have loadu_si64
60 #    if defined(__clang__) || (__GNUC__ >= 9)
61     switch (w)
62     {
63         case 7:
64             _mm_storeu_ps (out, _mm_cvtph_ps (_mm_loadu_si64 (in)));
65             out[4] = half_to_float (in[4]);
66             out[5] = half_to_float (in[5]);
67             out[6] = half_to_float (in[6]);
68             break;
69         case 6:
70             _mm_storeu_ps (out, _mm_cvtph_ps (_mm_loadu_si64 (in)));
71             out[4] = half_to_float (in[4]);
72             out[5] = half_to_float (in[5]);
73             break;
74         case 5:
75             _mm_storeu_ps (out, _mm_cvtph_ps (_mm_loadu_si64 (in)));
76             out[4] = half_to_float (in[4]);
77             break;
78         case 4: _mm_storeu_ps (out, _mm_cvtph_ps (_mm_loadu_si64 (in))); break;
79         case 3:
80             out[0] = half_to_float (in[0]);
81             out[1] = half_to_float (in[1]);
82             out[2] = half_to_float (in[2]);
83             break;
84         case 2:
85             out[0] = half_to_float (in[0]);
86             out[1] = half_to_float (in[1]);
87             break;
88         case 1: out[0] = half_to_float (in[0]); break;
89     }
90 #    else
91     while (w > 0)
92     {
93         *out++ = half_to_float (*in++);
94         --w;
95     }
96 #    endif
97 }
98 
99 #    ifndef __F16C__
/*
 * Scalar conversion of w half values from in into floats at out, built on
 * the half_to_float4 / half_to_float8 helpers.
 */
static void
half_to_float_buffer_impl (float* out, const uint16_t* in, int w)
{
    int i = 0;

    /* bulk of the buffer, eight values per step */
    for (; w >= 8; w -= 8, out += 8, in += 8)
        half_to_float8 (out, in);

    /* fewer than eight remain: one group of four if it fits ... */
    if (w >= 4)
    {
        half_to_float4 (out, in);
        i = 4;
    }

    /* ... then the leftovers one value at a time */
    for (; i < w; ++i)
        out[i] = half_to_float (in[i]);
}
140 
141 static void (*half_to_float_buffer) (float*, const uint16_t*, int) =
142     &half_to_float_buffer_impl;
143 
144 static void
choose_half_to_float_impl()145 choose_half_to_float_impl ()
146 {
147     // regs[2] in the extended block is ECX, where f16c indicator lives
148 #        ifdef _WIN32
149     int regs[4];
150 
151     __cpuid (regs, 0);
152     if (regs[0] >= 1) { __cpuidex (regs, 1, 0); }
153     else regs[2] = 0;
154 #        else
155     unsigned int regs[4];
156     __get_cpuid (0, &regs[0], &regs[1], &regs[2], &regs[3]);
157     if (regs[0] >= 1)
158     {
159         __get_cpuid (1, &regs[0], &regs[1], &regs[2], &regs[3]);
160     }
161     else
162         regs[2] = 0;
163 #        endif
164     /* F16C is indicated by bit 29 */
165     if (regs[2] & (1 << 29)) half_to_float_buffer = &half_to_float_buffer_f16c;
166 }
167 #    else
/*
 * The file is being compiled with F16C enabled, so half_to_float_buffer is
 * already the F16C routine and there is nothing to select at runtime.
 * Declared with a (void) prototype so calls are checked against an empty
 * parameter list.
 */
static void
choose_half_to_float_impl (void)
{}
172 
173 #    endif /* F16C */
174 
175 #else
176 
/*
 * Generic conversion of w half values from in into floats at out, used when
 * the x86-64 / F16C code above is not compiled in.
 */
static inline void
half_to_float_buffer (float* out, const uint16_t* in, int w)
{
#    if EXR_HOST_IS_NOT_LITTLE_ENDIAN
    /* each value goes through one_to_native16 before being converted */
    for (int i = 0; i < w; ++i)
    {
        const uint16_t native = one_to_native16 (in[i]);

        out[i] = half_to_float (native);
    }
#    else
    int i = 0;

    /* bulk of the buffer, eight values per step */
    for (; w >= 8; w -= 8, out += 8, in += 8)
        half_to_float8 (out, in);

    /* fewer than eight remain: one group of four if it fits ... */
    if (w >= 4)
    {
        half_to_float4 (out, in);
        i = 4;
    }

    /* ... then the leftovers one value at a time */
    for (; i < w; ++i)
        out[i] = half_to_float (in[i]);
#    endif
}
222 
/*
 * No alternative converter exists in this configuration, so there is nothing
 * to choose. Declared with a (void) prototype so calls are checked against
 * an empty parameter list.
 */
static void
choose_half_to_float_impl (void)
{}
226 
227 #endif
228 
229 /**************************************/
230 
231 static exr_result_t
unpack_16bit_3chan_interleave(exr_decode_pipeline_t * decode)232 unpack_16bit_3chan_interleave (exr_decode_pipeline_t* decode)
233 {
234     /* we know we're unpacking all the channels and there is no subsampling */
235     const uint8_t*  srcbuffer = decode->unpacked_buffer;
236     const uint16_t *in0, *in1, *in2;
237     uint8_t*        out0;
238     int             w, h;
239     int             linc0;
240 
241     w     = decode->channels[0].width;
242     h     = decode->chunk.height;
243     linc0 = decode->channels[0].user_line_stride;
244 
245     out0 = decode->channels[0].decode_to_ptr;
246 
247     /* interleaving case, we can do this! */
248     for (int y = 0; y < h; ++y)
249     {
250         uint16_t* out = (uint16_t*) out0;
251 
252         in0 = (const uint16_t*) srcbuffer;
253         in1 = in0 + w;
254         in2 = in1 + w;
255 
256         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
257         for (int x = 0; x < w; ++x)
258         {
259             out[0] = one_to_native16 (in0[x]);
260             out[1] = one_to_native16 (in1[x]);
261             out[2] = one_to_native16 (in2[x]);
262             out += 3;
263         }
264         out0 += linc0;
265     }
266     return EXR_ERR_SUCCESS;
267 }
268 
269 /**************************************/
270 
271 static exr_result_t
unpack_16bit_3chan_interleave_rev(exr_decode_pipeline_t * decode)272 unpack_16bit_3chan_interleave_rev (exr_decode_pipeline_t* decode)
273 {
274     /* we know we're unpacking all the channels and there is no subsampling */
275     const uint8_t*  srcbuffer = decode->unpacked_buffer;
276     const uint16_t *in0, *in1, *in2;
277     uint8_t*        out0;
278     int             w, h;
279     int             linc0;
280 
281     w     = decode->channels[0].width;
282     h     = decode->chunk.height;
283     linc0 = decode->channels[0].user_line_stride;
284 
285     out0 = decode->channels[2].decode_to_ptr;
286 
287     /* interleaving case, we can do this! */
288     for (int y = 0; y < h; ++y)
289     {
290         uint16_t* out = (uint16_t*) out0;
291 
292         in0 = (const uint16_t*) srcbuffer; // B
293         in1 = in0 + w; // G
294         in2 = in1 + w; // R
295 
296         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
297         for (int x = 0; x < w; ++x)
298         {
299             out[0] = one_to_native16 (in2[x]);
300             out[1] = one_to_native16 (in1[x]);
301             out[2] = one_to_native16 (in0[x]);
302             out += 3;
303         }
304         out0 += linc0;
305     }
306     return EXR_ERR_SUCCESS;
307 }
308 
309 /**************************************/
310 
311 static exr_result_t
unpack_half_to_float_3chan_interleave(exr_decode_pipeline_t * decode)312 unpack_half_to_float_3chan_interleave (exr_decode_pipeline_t* decode)
313 {
314     /* we know we're unpacking all the channels and there is no subsampling */
315     const uint8_t*  srcbuffer = decode->unpacked_buffer;
316     const uint16_t *in0, *in1, *in2;
317     uint8_t*        out0;
318     int             w, h;
319     int             linc0;
320 
321     w     = decode->channels[0].width;
322     h     = decode->chunk.height;
323     linc0 = decode->channels[0].user_line_stride;
324 
325     out0 = decode->channels[0].decode_to_ptr;
326 
327     /* interleaving case, we can do this! */
328     for (int y = 0; y < h; ++y)
329     {
330         float* out = (float*) out0;
331 
332         in0 = (const uint16_t*) srcbuffer;
333         in1 = in0 + w;
334         in2 = in1 + w;
335 
336         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
337         for (int x = 0; x < w; ++x)
338         {
339             out[0] = half_to_float (one_to_native16 (in0[x]));
340             out[1] = half_to_float (one_to_native16 (in1[x]));
341             out[2] = half_to_float (one_to_native16 (in2[x]));
342             out += 3;
343         }
344         out0 += linc0;
345     }
346     return EXR_ERR_SUCCESS;
347 }
348 
349 /**************************************/
350 
351 static exr_result_t
unpack_half_to_float_3chan_interleave_rev(exr_decode_pipeline_t * decode)352 unpack_half_to_float_3chan_interleave_rev (exr_decode_pipeline_t* decode)
353 {
354     /* we know we're unpacking all the channels and there is no subsampling */
355     const uint8_t*  srcbuffer = decode->unpacked_buffer;
356     const uint16_t *in0, *in1, *in2;
357     uint8_t*        out0;
358     int             w, h;
359     int             linc0;
360 
361     w     = decode->channels[0].width;
362     h     = decode->chunk.height;
363     linc0 = decode->channels[0].user_line_stride;
364 
365     out0 = decode->channels[2].decode_to_ptr;
366 
367     /* interleaving case, we can do this! */
368     for (int y = 0; y < h; ++y)
369     {
370         float* out = (float*) out0;
371 
372         in0 = (const uint16_t*) srcbuffer;
373         in1 = in0 + w;
374         in2 = in1 + w;
375 
376         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
377         for (int x = 0; x < w; ++x)
378         {
379             out[0] = half_to_float (one_to_native16 (in2[x]));
380             out[1] = half_to_float (one_to_native16 (in1[x]));
381             out[2] = half_to_float (one_to_native16 (in0[x]));
382             out += 3;
383         }
384         out0 += linc0;
385     }
386     return EXR_ERR_SUCCESS;
387 }
388 
389 /**************************************/
390 
391 static exr_result_t
unpack_16bit_3chan_planar(exr_decode_pipeline_t * decode)392 unpack_16bit_3chan_planar (exr_decode_pipeline_t* decode)
393 {
394     /* we know we're unpacking all the channels and there is no subsampling */
395     const uint8_t*  srcbuffer = decode->unpacked_buffer;
396     const uint16_t *in0, *in1, *in2;
397     uint8_t *       out0, *out1, *out2;
398     int             w, h;
399     int             inc0, inc1, inc2;
400     int             linc0, linc1, linc2;
401 
402     w     = decode->channels[0].width;
403     h     = decode->chunk.height;
404     inc0  = decode->channels[0].user_pixel_stride;
405     inc1  = decode->channels[1].user_pixel_stride;
406     inc2  = decode->channels[2].user_pixel_stride;
407     linc0 = decode->channels[0].user_line_stride;
408     linc1 = decode->channels[1].user_line_stride;
409     linc2 = decode->channels[2].user_line_stride;
410 
411     out0 = decode->channels[0].decode_to_ptr;
412     out1 = decode->channels[1].decode_to_ptr;
413     out2 = decode->channels[2].decode_to_ptr;
414 
415     // planar output
416     for (int y = 0; y < h; ++y)
417     {
418         in0 = (const uint16_t*) srcbuffer;
419         in1 = in0 + w;
420         in2 = in1 + w;
421         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
422                             /* specialise to memcpy if we can */
423 #if EXR_HOST_IS_NOT_LITTLE_ENDIAN
424         for (int x = 0; x < w; ++x)
425             *(((uint16_t*) out0) + x) = one_to_native16 (in0[x]);
426         for (int x = 0; x < w; ++x)
427             *(((uint16_t*) out1) + x) = one_to_native16 (in1[x]);
428         for (int x = 0; x < w; ++x)
429             *(((uint16_t*) out2) + x) = one_to_native16 (in2[x]);
430 #else
431         memcpy (out0, in0, (size_t) (w) * sizeof (uint16_t));
432         memcpy (out1, in1, (size_t) (w) * sizeof (uint16_t));
433         memcpy (out2, in2, (size_t) (w) * sizeof (uint16_t));
434 #endif
435         out0 += linc0;
436         out1 += linc1;
437         out2 += linc2;
438     }
439 
440     return EXR_ERR_SUCCESS;
441 }
442 
443 /**************************************/
444 
445 static exr_result_t
unpack_half_to_float_3chan_planar(exr_decode_pipeline_t * decode)446 unpack_half_to_float_3chan_planar (exr_decode_pipeline_t* decode)
447 {
448     /* we know we're unpacking all the channels and there is no subsampling */
449     const uint8_t*  srcbuffer = decode->unpacked_buffer;
450     const uint16_t *in0, *in1, *in2;
451     uint8_t *       out0, *out1, *out2;
452     int             w, h;
453     int             inc0, inc1, inc2;
454     int             linc0, linc1, linc2;
455 
456     w     = decode->channels[0].width;
457     h     = decode->chunk.height;
458     inc0  = decode->channels[0].user_pixel_stride;
459     inc1  = decode->channels[1].user_pixel_stride;
460     inc2  = decode->channels[2].user_pixel_stride;
461     linc0 = decode->channels[0].user_line_stride;
462     linc1 = decode->channels[1].user_line_stride;
463     linc2 = decode->channels[2].user_line_stride;
464 
465     out0 = decode->channels[0].decode_to_ptr;
466     out1 = decode->channels[1].decode_to_ptr;
467     out2 = decode->channels[2].decode_to_ptr;
468 
469     // planar output
470     for (int y = 0; y < h; ++y)
471     {
472         in0 = (const uint16_t*) srcbuffer;
473         in1 = in0 + w;
474         in2 = in1 + w;
475         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
476                             /* specialise to memcpy if we can */
477         half_to_float_buffer ((float*) out0, in0, w);
478         half_to_float_buffer ((float*) out1, in1, w);
479         half_to_float_buffer ((float*) out2, in2, w);
480 
481         out0 += linc0;
482         out1 += linc1;
483         out2 += linc2;
484     }
485 
486     return EXR_ERR_SUCCESS;
487 }
488 
489 /**************************************/
490 
491 static exr_result_t
unpack_16bit_3chan(exr_decode_pipeline_t * decode)492 unpack_16bit_3chan (exr_decode_pipeline_t* decode)
493 {
494     /* we know we're unpacking all the channels and there is no subsampling */
495     const uint8_t*  srcbuffer = decode->unpacked_buffer;
496     const uint16_t *in0, *in1, *in2;
497     uint8_t *       out0, *out1, *out2;
498     int             w, h;
499     int             inc0, inc1, inc2;
500     int             linc0, linc1, linc2;
501 
502     w     = decode->channels[0].width;
503     h     = decode->chunk.height;
504     inc0  = decode->channels[0].user_pixel_stride;
505     inc1  = decode->channels[1].user_pixel_stride;
506     inc2  = decode->channels[2].user_pixel_stride;
507     linc0 = decode->channels[0].user_line_stride;
508     linc1 = decode->channels[1].user_line_stride;
509     linc2 = decode->channels[2].user_line_stride;
510 
511     out0 = decode->channels[0].decode_to_ptr;
512     out1 = decode->channels[1].decode_to_ptr;
513     out2 = decode->channels[2].decode_to_ptr;
514 
515     for (int y = 0; y < h; ++y)
516     {
517         in0 = (const uint16_t*) srcbuffer;
518         in1 = in0 + w;
519         in2 = in1 + w;
520         srcbuffer += w * 6; // 3 * sizeof(uint16_t), avoid type conversion
521         for (int x = 0; x < w; ++x)
522             *((uint16_t*) (out0 + x * inc0)) = one_to_native16 (in0[x]);
523         for (int x = 0; x < w; ++x)
524             *((uint16_t*) (out1 + x * inc1)) = one_to_native16 (in1[x]);
525         for (int x = 0; x < w; ++x)
526             *((uint16_t*) (out2 + x * inc2)) = one_to_native16 (in2[x]);
527         out0 += linc0;
528         out1 += linc1;
529         out2 += linc2;
530     }
531 
532     return EXR_ERR_SUCCESS;
533 }
534 
535 /**************************************/
536 
537 static exr_result_t
unpack_16bit_4chan_interleave(exr_decode_pipeline_t * decode)538 unpack_16bit_4chan_interleave (exr_decode_pipeline_t* decode)
539 {
540     /* we know we're unpacking all the channels and there is no subsampling */
541     const uint8_t*  srcbuffer = decode->unpacked_buffer;
542     const uint16_t *in0, *in1, *in2, *in3;
543     uint8_t*        out0;
544     int             w, h;
545     int             linc0;
546     /* TODO: can do this with sse and do 2 outpixels at once */
547     union
548     {
549         struct
550         {
551             uint16_t a;
552             uint16_t b;
553             uint16_t g;
554             uint16_t r;
555         };
556         uint64_t allc;
557     } combined;
558 
559     w     = decode->channels[0].width;
560     h     = decode->chunk.height;
561     linc0 = decode->channels[0].user_line_stride;
562 
563     out0 = decode->channels[0].decode_to_ptr;
564 
565     /* interleaving case, we can do this! */
566     for (int y = 0; y < h; ++y)
567     {
568         uint64_t* outall = (uint64_t*) out0;
569         in0              = (const uint16_t*) srcbuffer;
570         in1              = in0 + w;
571         in2              = in1 + w;
572         in3              = in2 + w;
573 
574         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
575         for (int x = 0; x < w; ++x)
576         {
577             combined.a = one_to_native16 (in0[x]);
578             combined.b = one_to_native16 (in1[x]);
579             combined.g = one_to_native16 (in2[x]);
580             combined.r = one_to_native16 (in3[x]);
581             outall[x]  = combined.allc;
582         }
583         out0 += linc0;
584     }
585     return EXR_ERR_SUCCESS;
586 }
587 
588 /**************************************/
589 
590 static exr_result_t
unpack_16bit_4chan_interleave_rev(exr_decode_pipeline_t * decode)591 unpack_16bit_4chan_interleave_rev (exr_decode_pipeline_t* decode)
592 {
593     /* we know we're unpacking all the channels and there is no subsampling */
594     const uint8_t*  srcbuffer = decode->unpacked_buffer;
595     const uint16_t *in0, *in1, *in2, *in3;
596     uint8_t*        out0;
597     int             w, h;
598     int             linc0;
599     /* TODO: can do this with sse and do 2 outpixels at once */
600     union
601     {
602         struct
603         {
604             uint16_t r;
605             uint16_t g;
606             uint16_t b;
607             uint16_t a;
608         };
609         uint64_t allc;
610     } combined;
611 
612     w     = decode->channels[0].width;
613     h     = decode->chunk.height;
614     linc0 = decode->channels[0].user_line_stride;
615 
616     out0 = decode->channels[3].decode_to_ptr;
617 
618     /* interleaving case, we can do this! */
619     for (int y = 0; y < h; ++y)
620     {
621         uint64_t* outall = (uint64_t*) out0;
622         in0              = (const uint16_t*) srcbuffer;
623         in1              = in0 + w;
624         in2              = in1 + w;
625         in3              = in2 + w;
626 
627         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
628         for (int x = 0; x < w; ++x)
629         {
630             combined.a = one_to_native16 (in0[x]);
631             combined.b = one_to_native16 (in1[x]);
632             combined.g = one_to_native16 (in2[x]);
633             combined.r = one_to_native16 (in3[x]);
634             outall[x]  = combined.allc;
635         }
636         out0 += linc0;
637     }
638     return EXR_ERR_SUCCESS;
639 }
640 
641 /**************************************/
642 
643 static exr_result_t
unpack_half_to_float_4chan_interleave(exr_decode_pipeline_t * decode)644 unpack_half_to_float_4chan_interleave (exr_decode_pipeline_t* decode)
645 {
646     /* we know we're unpacking all the channels and there is no subsampling */
647     const uint8_t*  srcbuffer = decode->unpacked_buffer;
648     const uint16_t *in0, *in1, *in2, *in3;
649     uint8_t*        out0;
650     int             w, h;
651     int             linc0;
652 
653     w     = decode->channels[0].width;
654     h     = decode->chunk.height;
655     linc0 = decode->channels[0].user_line_stride;
656 
657     out0 = decode->channels[0].decode_to_ptr;
658 
659     /* interleaving case, we can do this! */
660     for (int y = 0; y < h; ++y)
661     {
662         float* out = (float*) out0;
663         in0        = (const uint16_t*) srcbuffer;
664         in1        = in0 + w;
665         in2        = in1 + w;
666         in3        = in2 + w;
667 
668         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
669         for (int x = 0; x < w; ++x)
670         {
671             out[0] = half_to_float (one_to_native16 (in3[x]));
672             out[1] = half_to_float (one_to_native16 (in2[x]));
673             out[2] = half_to_float (one_to_native16 (in1[x]));
674             out[3] = half_to_float (one_to_native16 (in0[x]));
675             out += 4;
676         }
677         out0 += linc0;
678     }
679     return EXR_ERR_SUCCESS;
680 }
681 
682 /**************************************/
683 
684 static exr_result_t
unpack_half_to_float_4chan_interleave_rev(exr_decode_pipeline_t * decode)685 unpack_half_to_float_4chan_interleave_rev (exr_decode_pipeline_t* decode)
686 {
687     /* we know we're unpacking all the channels and there is no subsampling */
688     const uint8_t*  srcbuffer = decode->unpacked_buffer;
689     const uint16_t *in0, *in1, *in2, *in3;
690     uint8_t*        out0;
691     int             w, h;
692     int             linc0;
693 
694     w     = decode->channels[0].width;
695     h     = decode->chunk.height;
696     linc0 = decode->channels[0].user_line_stride;
697 
698     out0 = decode->channels[3].decode_to_ptr;
699 
700     /* interleaving case, we can do this! */
701     for (int y = 0; y < h; ++y)
702     {
703         float* out = (float*) out0;
704         in0        = (const uint16_t*) srcbuffer;
705         in1        = in0 + w;
706         in2        = in1 + w;
707         in3        = in2 + w;
708 
709         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
710         for (int x = 0; x < w; ++x)
711         {
712             out[0] = half_to_float (one_to_native16 (in0[x]));
713             out[1] = half_to_float (one_to_native16 (in1[x]));
714             out[2] = half_to_float (one_to_native16 (in2[x]));
715             out[3] = half_to_float (one_to_native16 (in3[x]));
716             out += 4;
717         }
718         out0 += linc0;
719     }
720     return EXR_ERR_SUCCESS;
721 }
722 
723 /**************************************/
724 
725 static exr_result_t
unpack_16bit_4chan_planar(exr_decode_pipeline_t * decode)726 unpack_16bit_4chan_planar (exr_decode_pipeline_t* decode)
727 {
728     /* we know we're unpacking all the channels and there is no subsampling */
729     const uint8_t*  srcbuffer = decode->unpacked_buffer;
730     const uint16_t *in0, *in1, *in2, *in3;
731     uint8_t *       out0, *out1, *out2, *out3;
732     int             w, h;
733     int             linc0, linc1, linc2, linc3;
734 
735     w     = decode->channels[0].width;
736     h     = decode->chunk.height;
737     linc0 = decode->channels[0].user_line_stride;
738     linc1 = decode->channels[1].user_line_stride;
739     linc2 = decode->channels[2].user_line_stride;
740     linc3 = decode->channels[3].user_line_stride;
741 
742     out0 = decode->channels[0].decode_to_ptr;
743     out1 = decode->channels[1].decode_to_ptr;
744     out2 = decode->channels[2].decode_to_ptr;
745     out3 = decode->channels[3].decode_to_ptr;
746 
747     // planar output
748     for (int y = 0; y < h; ++y)
749     {
750         in0 = (const uint16_t*) srcbuffer;
751         in1 = in0 + w;
752         in2 = in1 + w;
753         in3 = in2 + w;
754         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
755                             /* specialize to memcpy if we can */
756 #if EXR_HOST_IS_NOT_LITTLE_ENDIAN
757         for (int x = 0; x < w; ++x)
758             *(((uint16_t*) out0) + x) = one_to_native16 (in0[x]);
759         for (int x = 0; x < w; ++x)
760             *(((uint16_t*) out1) + x) = one_to_native16 (in1[x]);
761         for (int x = 0; x < w; ++x)
762             *(((uint16_t*) out2) + x) = one_to_native16 (in2[x]);
763         for (int x = 0; x < w; ++x)
764             *(((uint16_t*) out3) + x) = one_to_native16 (in3[x]);
765 #else
766         memcpy (out0, in0, (size_t) (w) * sizeof (uint16_t));
767         memcpy (out1, in1, (size_t) (w) * sizeof (uint16_t));
768         memcpy (out2, in2, (size_t) (w) * sizeof (uint16_t));
769         memcpy (out3, in3, (size_t) (w) * sizeof (uint16_t));
770 #endif
771         out0 += linc0;
772         out1 += linc1;
773         out2 += linc2;
774         out3 += linc3;
775     }
776     return EXR_ERR_SUCCESS;
777 }
778 
779 /**************************************/
780 
781 static exr_result_t
unpack_half_to_float_4chan_planar(exr_decode_pipeline_t * decode)782 unpack_half_to_float_4chan_planar (exr_decode_pipeline_t* decode)
783 {
784     /* we know we're unpacking all the channels and there is no subsampling */
785     const uint8_t*  srcbuffer = decode->unpacked_buffer;
786     const uint16_t *in0, *in1, *in2, *in3;
787     uint8_t *       out0, *out1, *out2, *out3;
788     int             w, h;
789     int             linc0, linc1, linc2, linc3;
790 
791     w     = decode->channels[0].width;
792     h     = decode->chunk.height;
793     linc0 = decode->channels[0].user_line_stride;
794     linc1 = decode->channels[1].user_line_stride;
795     linc2 = decode->channels[2].user_line_stride;
796     linc3 = decode->channels[3].user_line_stride;
797 
798     out0 = decode->channels[0].decode_to_ptr;
799     out1 = decode->channels[1].decode_to_ptr;
800     out2 = decode->channels[2].decode_to_ptr;
801     out3 = decode->channels[3].decode_to_ptr;
802 
803     // planar output
804     for (int y = 0; y < h; ++y)
805     {
806         in0 = (const uint16_t*) srcbuffer;
807         in1 = in0 + w;
808         in2 = in1 + w;
809         in3 = in2 + w;
810         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
811 
812         half_to_float_buffer ((float*) out0, in0, w);
813         half_to_float_buffer ((float*) out1, in1, w);
814         half_to_float_buffer ((float*) out2, in2, w);
815         half_to_float_buffer ((float*) out3, in3, w);
816 
817         out0 += linc0;
818         out1 += linc1;
819         out2 += linc2;
820         out3 += linc3;
821     }
822     return EXR_ERR_SUCCESS;
823 }
824 
825 /**************************************/
826 
827 static exr_result_t
unpack_16bit_4chan(exr_decode_pipeline_t * decode)828 unpack_16bit_4chan (exr_decode_pipeline_t* decode)
829 {
830     /* we know we're unpacking all the channels and there is no subsampling */
831     const uint8_t*  srcbuffer = decode->unpacked_buffer;
832     const uint16_t *in0, *in1, *in2, *in3;
833     uint8_t *       out0, *out1, *out2, *out3;
834     int             w, h;
835     int             inc0, inc1, inc2, inc3;
836     int             linc0, linc1, linc2, linc3;
837 
838     w     = decode->channels[0].width;
839     h     = decode->chunk.height;
840     inc0  = decode->channels[0].user_pixel_stride;
841     inc1  = decode->channels[1].user_pixel_stride;
842     inc2  = decode->channels[2].user_pixel_stride;
843     inc3  = decode->channels[3].user_pixel_stride;
844     linc0 = decode->channels[0].user_line_stride;
845     linc1 = decode->channels[1].user_line_stride;
846     linc2 = decode->channels[2].user_line_stride;
847     linc3 = decode->channels[3].user_line_stride;
848 
849     out0 = decode->channels[0].decode_to_ptr;
850     out1 = decode->channels[1].decode_to_ptr;
851     out2 = decode->channels[2].decode_to_ptr;
852     out3 = decode->channels[3].decode_to_ptr;
853 
854     for (int y = 0; y < h; ++y)
855     {
856         in0 = (const uint16_t*) srcbuffer;
857         in1 = in0 + w;
858         in2 = in1 + w;
859         in3 = in2 + w;
860         srcbuffer += w * 8; // 4 * sizeof(uint16_t), avoid type conversion
861         for (int x = 0; x < w; ++x)
862             *((uint16_t*) (out0 + x * inc0)) = one_to_native16 (in0[x]);
863         for (int x = 0; x < w; ++x)
864             *((uint16_t*) (out1 + x * inc1)) = one_to_native16 (in1[x]);
865         for (int x = 0; x < w; ++x)
866             *((uint16_t*) (out2 + x * inc2)) = one_to_native16 (in2[x]);
867         for (int x = 0; x < w; ++x)
868             *((uint16_t*) (out3 + x * inc3)) = one_to_native16 (in3[x]);
869         out0 += linc0;
870         out1 += linc1;
871         out2 += linc2;
872         out3 += linc3;
873     }
874     return EXR_ERR_SUCCESS;
875 }
876 
877 /**************************************/
878 
879 static exr_result_t
unpack_16bit(exr_decode_pipeline_t * decode)880 unpack_16bit (exr_decode_pipeline_t* decode)
881 {
882     /* we know we're unpacking all the channels and there is no subsampling */
883     const uint8_t* srcbuffer = decode->unpacked_buffer;
884     uint8_t*       cdata;
885     int            w, h, pixincrement;
886 
887     h = decode->chunk.height;
888     for (int y = 0; y < h; ++y)
889     {
890         for (int c = 0; c < decode->channel_count; ++c)
891         {
892             exr_coding_channel_info_t* decc = (decode->channels + c);
893 
894             cdata        = decc->decode_to_ptr;
895             w            = decc->width;
896             pixincrement = decc->user_pixel_stride;
897             cdata += (uint64_t) y * (uint64_t) decc->user_line_stride;
898             /* specialize to memcpy if we can */
899 #if EXR_HOST_IS_NOT_LITTLE_ENDIAN
900             if (pixincrement == 2)
901             {
902                 uint16_t*       tmp = (uint16_t*) cdata;
903                 const uint16_t* src = (const uint16_t*) srcbuffer;
904                 uint16_t*       end = tmp + w;
905 
906                 while (tmp < end)
907                     *tmp++ = one_to_native16 (*src++);
908             }
909             else
910             {
911                 const uint16_t* src = (const uint16_t*) srcbuffer;
912                 for (int x = 0; x < w; ++x)
913                 {
914                     *((uint16_t*) cdata) = one_to_native16 (*src++);
915                     cdata += pixincrement;
916                 }
917             }
918 #else
919             if (pixincrement == 2)
920             {
921                 memcpy (cdata, srcbuffer, (size_t) (w) *2);
922             }
923             else
924             {
925                 const uint16_t* src = (const uint16_t*) srcbuffer;
926                 for (int x = 0; x < w; ++x)
927                 {
928                     *((uint16_t*) cdata) = *src++;
929                     cdata += pixincrement;
930                 }
931             }
932 #endif
933             srcbuffer += w * 2;
934         }
935     }
936     return EXR_ERR_SUCCESS;
937 }
938 
939 //static exr_result_t unpack_32bit_3chan (exr_decode_pipeline_t* decode);
940 //static exr_result_t unpack_32bit_4chan (exr_decode_pipeline_t* decode);
941 
942 static exr_result_t
unpack_32bit(exr_decode_pipeline_t * decode)943 unpack_32bit (exr_decode_pipeline_t* decode)
944 {
945     /* we know we're unpacking all the channels and there is no subsampling */
946     const uint8_t* srcbuffer = decode->unpacked_buffer;
947     uint8_t*       cdata;
948     int64_t        w, h, pixincrement;
949     int            chans = decode->channel_count;
950 
951     h = (int64_t) decode->chunk.height;
952 
953     for (int64_t y = 0; y < h; ++y)
954     {
955         for (int c = 0; c < chans; ++c)
956         {
957             exr_coding_channel_info_t* decc = (decode->channels + c);
958 
959             cdata        = decc->decode_to_ptr;
960             w            = decc->width;
961             pixincrement = decc->user_pixel_stride;
962             cdata += y * (int64_t) decc->user_line_stride;
963             /* specialize to memcpy if we can */
964 #if EXR_HOST_IS_NOT_LITTLE_ENDIAN
965             if (pixincrement == 4)
966             {
967                 uint32_t*       tmp = (uint32_t*) cdata;
968                 const uint32_t* src = (const uint32_t*) srcbuffer;
969                 uint32_t*       end = tmp + w;
970 
971                 while (tmp < end)
972                     *tmp++ = le32toh (*src++);
973             }
974             else
975             {
976                 const uint32_t* src = (const uint32_t*) srcbuffer;
977                 for (int64_t x = 0; x < w; ++x)
978                 {
979                     *((uint32_t*) cdata) = le32toh (*src++);
980                     cdata += pixincrement;
981                 }
982             }
983 #else
984             if (pixincrement == 4)
985             {
986                 memcpy (cdata, srcbuffer, (size_t) (w) *4);
987             }
988             else
989             {
990                 const uint32_t* src = (const uint32_t*) srcbuffer;
991                 for (int64_t x = 0; x < w; ++x)
992                 {
993                     *((uint32_t*) cdata) = *src++;
994                     cdata += pixincrement;
995                 }
996             }
997 #endif
998             srcbuffer += w * 4;
999         }
1000     }
1001     return EXR_ERR_SUCCESS;
1002 }
1003 
/*
 * UNPACK_SAMPLES(samps): convert `samps` consecutive samples of one
 * channel from the packed source into the output buffer, dispatching
 * on the stored type (decc->data_type) and the requested output type
 * (decc->user_data_type).
 *
 * The macro expands in place and relies on these names being in scope
 * where it is used:
 *   decc      - channel info supplying data_type / user_data_type
 *   srcbuffer - start of the packed samples; only read, never
 *               advanced (the caller steps past the data afterwards)
 *   cdata     - output cursor, advanced by ubpc bytes per sample
 *   ubpc      - distance in bytes between consecutive output samples
 *
 * An unrecognized stored or requested type makes the *enclosing
 * function* return EXR_ERR_INVALID_ARGUMENT. The expansion is a
 * complete switch statement, so call sites use it without a trailing
 * semicolon. The argument is parenthesized in every loop condition so
 * an expression may be passed safely; note it is re-evaluated on each
 * iteration.
 */
#define UNPACK_SAMPLES(samps)                                                  \
    switch (decc->data_type)                                                   \
    {                                                                          \
        case EXR_PIXEL_HALF:                                                   \
            switch (decc->user_data_type)                                      \
            {                                                                  \
                case EXR_PIXEL_HALF: {                                         \
                    const uint16_t* src = (const uint16_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        *((uint16_t*) cdata) = unaligned_load16 (src);         \
                        ++src;                                                 \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_FLOAT: {                                        \
                    const uint16_t* src = (const uint16_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint16_t cval = unaligned_load16 (src);                \
                        ++src;                                                 \
                        *((float*) cdata) = half_to_float (cval);              \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_UINT: {                                         \
                    const uint16_t* src = (const uint16_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint16_t cval = unaligned_load16 (src);                \
                        ++src;                                                 \
                        *((uint32_t*) cdata) = half_to_uint (cval);            \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                default: return EXR_ERR_INVALID_ARGUMENT;                      \
            }                                                                  \
            break;                                                             \
        case EXR_PIXEL_FLOAT:                                                  \
            switch (decc->user_data_type)                                      \
            {                                                                  \
                case EXR_PIXEL_HALF: {                                         \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint32_t fint = unaligned_load32 (src);                \
                        ++src;                                                 \
                        *((uint16_t*) cdata) = float_to_half_int (fint);       \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_FLOAT: {                                        \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        *((uint32_t*) cdata) = unaligned_load32 (src);         \
                        ++src;                                                 \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_UINT: {                                         \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint32_t fint = unaligned_load32 (src);                \
                        ++src;                                                 \
                        *((uint32_t*) cdata) = float_to_uint_int (fint);       \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                default: return EXR_ERR_INVALID_ARGUMENT;                      \
            }                                                                  \
            break;                                                             \
        case EXR_PIXEL_UINT:                                                   \
            switch (decc->user_data_type)                                      \
            {                                                                  \
                case EXR_PIXEL_HALF: {                                         \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint32_t fint = unaligned_load32 (src);                \
                        ++src;                                                 \
                        *((uint16_t*) cdata) = uint_to_half (fint);            \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_FLOAT: {                                        \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        uint32_t fint = unaligned_load32 (src);                \
                        ++src;                                                 \
                        *((float*) cdata) = uint_to_float (fint);              \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                case EXR_PIXEL_UINT: {                                         \
                    const uint32_t* src = (const uint32_t*) srcbuffer;         \
                    for (int s = 0; s < (samps); ++s)                          \
                    {                                                          \
                        *((uint32_t*) cdata) = unaligned_load32 (src);         \
                        ++src;                                                 \
                        cdata += ubpc;                                         \
                    }                                                          \
                    break;                                                     \
                }                                                              \
                default: return EXR_ERR_INVALID_ARGUMENT;                      \
            }                                                                  \
            break;                                                             \
        default: return EXR_ERR_INVALID_ARGUMENT;                              \
    }
1123 
1124 static exr_result_t
generic_unpack(exr_decode_pipeline_t * decode)1125 generic_unpack (exr_decode_pipeline_t* decode)
1126 {
1127     const uint8_t* srcbuffer = decode->unpacked_buffer;
1128     uint8_t*       cdata;
1129     int            w, bpc, ubpc;
1130 
1131     for (int y = 0; y < decode->chunk.height; ++y)
1132     {
1133         int cury = y + decode->chunk.start_y;
1134 
1135         for (int c = 0; c < decode->channel_count; ++c)
1136         {
1137             exr_coding_channel_info_t* decc = (decode->channels + c);
1138 
1139             cdata = decc->decode_to_ptr;
1140             w     = decc->width;
1141             bpc   = decc->bytes_per_element;
1142             ubpc  = decc->user_pixel_stride;
1143 
1144             if (decc->y_samples > 1)
1145             {
1146                 if ((cury % decc->y_samples) != 0) continue;
1147                 if (cdata)
1148                     cdata +=
1149                         ((uint64_t) (y / decc->y_samples) *
1150                          (uint64_t) decc->user_line_stride);
1151                 else
1152                 {
1153                     srcbuffer += w * bpc;
1154                     continue;
1155                 }
1156             }
1157             else if (cdata)
1158             {
1159                 cdata +=
1160                     ((uint64_t) y) * ((uint64_t) decc->user_line_stride);
1161             }
1162             else
1163             {
1164                 srcbuffer += w * bpc;
1165                 continue;
1166             }
1167 
1168             UNPACK_SAMPLES (w)
1169             srcbuffer += w * bpc;
1170         }
1171     }
1172     return EXR_ERR_SUCCESS;
1173 }
1174 
1175 static exr_result_t
generic_unpack_deep_pointers(exr_decode_pipeline_t * decode)1176 generic_unpack_deep_pointers (exr_decode_pipeline_t* decode)
1177 {
1178     const uint8_t* srcbuffer  = decode->unpacked_buffer;
1179     const int32_t* sampbuffer = decode->sample_count_table;
1180     void**         pdata;
1181     int            w, h, bpc, ubpc;
1182 
1183     w = decode->chunk.width;
1184     h = decode->chunk.height;
1185 
1186     for (int y = 0; y < h; ++y)
1187     {
1188         for (int c = 0; c < decode->channel_count; ++c)
1189         {
1190             exr_coding_channel_info_t* decc      = (decode->channels + c);
1191             int32_t                    prevsamps = 0;
1192             size_t                     pixstride;
1193             bpc   = decc->bytes_per_element;
1194             ubpc  = decc->user_bytes_per_element;
1195             pdata = (void**) decc->decode_to_ptr;
1196 
1197             if (!pdata)
1198             {
1199                 prevsamps = 0;
1200                 if ((decode->decode_flags &
1201                      EXR_DECODE_SAMPLE_COUNTS_AS_INDIVIDUAL))
1202                 {
1203                     for (int x = 0; x < w; ++x)
1204                         prevsamps += sampbuffer[x];
1205                 }
1206                 else
1207                     prevsamps = sampbuffer[w - 1];
1208                 srcbuffer += ((size_t) bpc) * ((size_t) prevsamps);
1209                 continue;
1210             }
1211 
1212             pdata += ((size_t)y) * (((size_t)decc->user_line_stride) / sizeof (void*));
1213             pixstride = ((size_t)decc->user_pixel_stride) / sizeof (void*);
1214 
1215             for (int x = 0; x < w; ++x)
1216             {
1217                 void*   outpix = *pdata;
1218                 int32_t samps  = sampbuffer[x];
1219                 if (0 == (decode->decode_flags &
1220                           EXR_DECODE_SAMPLE_COUNTS_AS_INDIVIDUAL))
1221                 {
1222                     int32_t tmp = samps - prevsamps;
1223                     prevsamps   = samps;
1224                     samps       = tmp;
1225                 }
1226 
1227                 pdata += pixstride;
1228                 if (outpix)
1229                 {
1230                     uint8_t* cdata = outpix;
1231                     UNPACK_SAMPLES (samps)
1232                 }
1233                 srcbuffer += bpc * samps;
1234             }
1235         }
1236         sampbuffer += w;
1237     }
1238     return EXR_ERR_SUCCESS;
1239 }
1240 
1241 static exr_result_t
generic_unpack_deep(exr_decode_pipeline_t * decode)1242 generic_unpack_deep (exr_decode_pipeline_t* decode)
1243 {
1244     const uint8_t* srcbuffer  = decode->unpacked_buffer;
1245     const int32_t* sampbuffer = decode->sample_count_table;
1246     uint8_t*       cdata;
1247     int            w, h, bpc, ubpc;
1248     size_t         totsamps = 0;
1249 
1250     w = decode->chunk.width;
1251     h = decode->chunk.height;
1252 
1253     for (int y = 0; y < h; ++y)
1254     {
1255         for (int c = 0; c < decode->channel_count; ++c)
1256         {
1257             exr_coding_channel_info_t* decc      = (decode->channels + c);
1258             int32_t                    prevsamps = 0;
1259 
1260             int incr_tot = ((c + 1) == decode->channel_count);
1261 
1262             bpc   = decc->bytes_per_element;
1263             ubpc  = decc->user_bytes_per_element;
1264             cdata = decc->decode_to_ptr;
1265 
1266             if (!cdata)
1267             {
1268                 prevsamps = 0;
1269                 if ((decode->decode_flags &
1270                      EXR_DECODE_SAMPLE_COUNTS_AS_INDIVIDUAL))
1271                 {
1272                     for (int x = 0; x < w; ++x)
1273                         prevsamps += sampbuffer[x];
1274                 }
1275                 else
1276                     prevsamps = sampbuffer[w - 1];
1277                 srcbuffer += ((size_t) bpc) * ((size_t) prevsamps);
1278 
1279                 if (incr_tot) totsamps += (size_t) prevsamps;
1280 
1281                 continue;
1282             }
1283             cdata += totsamps * ((size_t) ubpc);
1284 
1285             for (int x = 0; x < w; ++x)
1286             {
1287                 int32_t samps = sampbuffer[x];
1288                 if (0 == (decode->decode_flags &
1289                           EXR_DECODE_SAMPLE_COUNTS_AS_INDIVIDUAL))
1290                 {
1291                     int32_t tmp = samps - prevsamps;
1292                     prevsamps   = samps;
1293                     samps       = tmp;
1294                 }
1295 
1296                 UNPACK_SAMPLES (samps)
1297 
1298                 srcbuffer += bpc * samps;
1299                 if (incr_tot) totsamps += (size_t) samps;
1300             }
1301         }
1302         sampbuffer += w;
1303     }
1304 
1305     return EXR_ERR_SUCCESS;
1306 }
1307 
1308 /**************************************/
1309 
1310 internal_exr_unpack_fn
internal_exr_match_decode(exr_decode_pipeline_t * decode,int isdeep,int chanstofill,int chanstounpack,int sametype,int sameouttype,int samebpc,int sameoutbpc,int hassampling,int hastypechange,int sameoutinc,int simpinterleave,int simpinterleaverev,int simplineoff)1311 internal_exr_match_decode (
1312     exr_decode_pipeline_t* decode,
1313     int                    isdeep,
1314     int                    chanstofill,
1315     int                    chanstounpack,
1316     int                    sametype,
1317     int                    sameouttype,
1318     int                    samebpc,
1319     int                    sameoutbpc,
1320     int                    hassampling,
1321     int                    hastypechange,
1322     int                    sameoutinc,
1323     int                    simpinterleave,
1324     int                    simpinterleaverev,
1325     int                    simplineoff)
1326 {
1327     static int init_cpu_check = 1;
1328     if (init_cpu_check)
1329     {
1330         choose_half_to_float_impl ();
1331         init_cpu_check = 0;
1332     }
1333 
1334     if (isdeep)
1335     {
1336         if ((decode->decode_flags & EXR_DECODE_SAMPLE_COUNTS_AS_INDIVIDUAL))
1337             return &generic_unpack_deep_pointers;
1338         return &generic_unpack_deep;
1339     }
1340 
1341     if (hastypechange > 0)
1342     {
1343         /* other optimizations would not be difficult, but this will
1344          * be the common one (where on encode / pack we want to do the
1345          * opposite) */
1346         if (sametype == (int) EXR_PIXEL_HALF &&
1347             sameouttype == (int) EXR_PIXEL_FLOAT)
1348         {
1349             if (simpinterleave > 0)
1350             {
1351                 if (decode->channel_count == 4)
1352                     return &unpack_half_to_float_4chan_interleave;
1353                 if (decode->channel_count == 3)
1354                     return &unpack_half_to_float_3chan_interleave;
1355             }
1356 
1357             if (simpinterleaverev > 0)
1358             {
1359                 if (decode->channel_count == 4)
1360                     return &unpack_half_to_float_4chan_interleave_rev;
1361                 if (decode->channel_count == 3)
1362                     return &unpack_half_to_float_3chan_interleave_rev;
1363             }
1364 
1365             if (sameoutinc == 4)
1366             {
1367                 if (decode->channel_count == 4)
1368                     return &unpack_half_to_float_4chan_planar;
1369                 if (decode->channel_count == 3)
1370                     return &unpack_half_to_float_3chan_planar;
1371             }
1372         }
1373 
1374         return &generic_unpack;
1375     }
1376 
1377     if (hassampling || chanstofill != decode->channel_count || samebpc <= 0 ||
1378         sameoutbpc <= 0)
1379         return &generic_unpack;
1380 
1381     (void) chanstounpack;
1382     (void) simplineoff;
1383 
1384     if (samebpc == 2)
1385     {
1386         if (simpinterleave > 0)
1387         {
1388             if (decode->channel_count == 4)
1389                 return &unpack_16bit_4chan_interleave;
1390             if (decode->channel_count == 3)
1391                 return &unpack_16bit_3chan_interleave;
1392         }
1393 
1394         if (simpinterleaverev > 0)
1395         {
1396             if (decode->channel_count == 4)
1397                 return &unpack_16bit_4chan_interleave_rev;
1398             if (decode->channel_count == 3)
1399                 return &unpack_16bit_3chan_interleave_rev;
1400         }
1401 
1402         if (sameoutinc == 2)
1403         {
1404             if (decode->channel_count == 4) return &unpack_16bit_4chan_planar;
1405             if (decode->channel_count == 3) return &unpack_16bit_3chan_planar;
1406         }
1407 
1408         if (decode->channel_count == 4) return &unpack_16bit_4chan;
1409         if (decode->channel_count == 3) return &unpack_16bit_3chan;
1410 
1411         return &unpack_16bit;
1412     }
1413 
1414     if (samebpc == 4)
1415     {
1416         //if (decode->channel_count == 4) return &unpack_32bit_4chan;
1417         //if (decode->channel_count == 3) return &unpack_32bit_3chan;
1418         return &unpack_32bit;
1419     }
1420 
1421     return &generic_unpack;
1422 }
1423