1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // *       Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // *       Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // *       Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33 
34 //---------------------------------------------------
35 //
36 // class DwaCompressor -- Store lossy RGB data by quantizing
37 //                          DCT components.
38 //
// First, we try and figure out what compression strategy to take
// based on channel name. For RGB channels, we want a lossy method
// described below. But, if we have alpha, we should do something
// different (and probably using RLE). If we have depth, or velocity,
// or something else, just fall back to ZIP. The rules for deciding
// which strategy to use are set up in initializeDefaultChannelRules().
// When writing a file, the relevant rules needed to decode are written
// into the start of the data block, making a self-contained file.
// If initializeDefaultChannelRules() doesn't quite suit your naming
// conventions, you can adjust the rules without breaking decoder
// compatibility.
50 //
// If we're going to lossy compress R, G, or B channels, it's easier
// to toss bits in a more perceptually uniform space. One could argue
// at length as to what constitutes perceptually uniform, especially
// when storing either scene/input/focal plane referred and output referred
// data.
56 //
57 // We'll compromise. For values <= 1, we use a traditional power function
58 // (without any of that straight-line business at the bottom). For values > 1,
59 // we want something more like a log function, since power functions blow
60 // up. At 1, we want a smooth blend between the functions. So, we use a
61 // piecewise function that does just that - see dwaLookups.cpp for
62 // a little more detail.
63 //
// Also, if we find that we have R, G, and B channels from the same layer,
// we can get a bit more compression efficiency by transforming to a Y'CbCr
// space. We use the 709 transform, but with Cb,Cr = 0 for an input of
// (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point
// makes no sense with large range data. Transforms are done to/from
// the perceptual space data, not the linear-light space data (R'G'B' ->
// Y'CbCr, not RGB -> YCbCr).
71 //
72 // Next, we forward DCT the data. This is done with a floating
73 // point DCT, as we don't really have control over the src range. The
74 // resulting values are dropped to half-float precision.
75 //
76 // Now, we need to quantize. Quantization departs from the usual way
77 // of dividing and rounding. Instead, we start with some floating
78 // point "base-error" value. From this, we can derive quantization
79 // error for each DCT component. Take the standard JPEG quantization
80 // tables and normalize them by the smallest value. Then, multiply
81 // the normalized quant tables by our base-error value. This gives
82 // a range of errors for each DCT component.
83 //
// For each DCT component, we want to find a quantized value that
// is within +- the per-component error. Pick the quantized value
// that has the fewest bits set in its binary representation.
// Brute-forcing the search would make for extremely inefficient
// compression. Fortunately, we can precompute a table to assist
// with this search.
90 //
// For each 16-bit float value, there are at most 15 other values with
// fewer bits set. We can precompute these values in a compact form, since
// many source values have far fewer than 15 possible quantized values.
// Now, instead of searching the entire range +- the component error,
// we can just search at most 15 quantization candidates. The search can
// be accelerated a bit more by sorting the candidates by the
// number of bits set, in increasing order. Then, the search can stop
// once a candidate is found w/i the per-component quantization
// error range.
100 //
101 // The quantization strategy has the side-benefit that there is no
102 // de-quantization step upon decode, so we don't bother recording
103 // the quantization table.
104 //
105 // Ok. So we now have quantized values. Time for entropy coding. We
106 // can use either static Huffman or zlib/DEFLATE. The static Huffman
107 // is more efficient at compacting data, but can have a greater
108 // overhead, especially for smaller tile/strip sizes.
109 //
110 // There is some additional fun, like ZIP compressing the DC components
111 // instead of Huffman/zlib, which helps make things slightly smaller.
112 //
// Compression level is controlled by setting an int/float/double attribute
// on the header named "dwaCompressionLevel". This is a thinly veiled name for
// the "base-error" value mentioned above. The "base-error" is just
// dwaCompressionLevel / 100000. The default value of 45.0 is generally
// pretty good at generating "visually lossless" values at reasonable
// data rates. Setting dwaCompressionLevel to 0 should result in no additional
// quantization at the quantization stage (though there may be
// quantization in practice at the CSC/DCT steps). But if you really
// want lossless compression, there are plenty of other choices
// of compressors ;)
123 //
124 // When dealing with FLOAT source buffers, we first quantize the source
125 // to HALF and continue down as we would for HALF source.
126 //
127 //---------------------------------------------------
128 
129 
130 #include "ImfDwaCompressor.h"
131 #include "ImfDwaCompressorSimd.h"
132 
133 #include "ImfChannelList.h"
134 #include "ImfStandardAttributes.h"
135 #include "ImfHeader.h"
136 #include "ImfHuf.h"
137 #include "ImfInt64.h"
138 #include "ImfIntAttribute.h"
139 #include "ImfIO.h"
140 #include "ImfMisc.h"
141 #include "ImfNamespace.h"
142 #include "ImfRle.h"
143 #include "ImfSimd.h"
144 #include "ImfSystemSpecific.h"
145 #include "ImfXdr.h"
146 #include "ImfZip.h"
147 
148 #include "ImathFun.h"
149 #include "ImathBox.h"
150 #include "ImathVec.h"
151 #include "half.h"
152 
153 #include "dwaLookups.h"
154 
155 #include <vector>
156 #include <string>
157 #include <cctype>
158 #include <cassert>
159 #include <algorithm>
160 
161 // Windows specific addition to prevent the indirect import of the redefined min/max macros
162 #if defined _WIN32 || defined _WIN64
163 	#ifdef NOMINMAX
164 		#undef NOMINMAX
165 	#endif
166 	#define NOMINMAX
167 #endif
168 #include <zlib.h>
169 
170 
171 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
172 
173 
174 namespace {
175 
176     //
177     // Function pointer to dispatch to an approprate
178     // convertFloatToHalf64_* impl, based on runtime cpu checking.
179     // Should be initialized in DwaCompressor::initializeFuncs()
180     //
181 
182     void (*convertFloatToHalf64)(unsigned short*, float*) =
183         convertFloatToHalf64_scalar;
184 
185     //
186     // Function pointer for dispatching a fromHalfZigZag_ impl
187     //
188 
189     void (*fromHalfZigZag)(unsigned short*, float*) =
190         fromHalfZigZag_scalar;
191 
192     //
193     // Dispatch the inverse DCT on an 8x8 block, where the last
194     // n rows can be all zeros. The n=0 case converts the full block.
195     //
196     void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
197     void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
198     void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
199     void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
200     void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
201     void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
202     void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
203     void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;
204 
205 } // namespace
206 
207 
208 struct DwaCompressor::ChannelData
209 {
210     std::string         name;
211     CompressorScheme    compression;
212     int                 xSampling;
213     int                 ySampling;
214     PixelType           type;
215     bool                pLinear;
216 
217     int                 width;
218     int                 height;
219 
220     //
221     // Incoming and outgoing data is scanline interleaved, and it's much
222     // easier to operate on contiguous data.  Assuming the planare unc
223     // buffer is to hold RLE data, we need to rearrange to make bytes
224     // adjacent.
225     //
226 
227     char               *planarUncBuffer;
228     char               *planarUncBufferEnd;
229 
230     char               *planarUncRle[4];
231     char               *planarUncRleEnd[4];
232 
233     PixelType           planarUncType;
234     int                 planarUncSize;
235 };
236 
237 
238 struct DwaCompressor::CscChannelSet
239 {
240     int idx[3];
241 };
242 
243 
244 struct DwaCompressor::Classifier
245 {
ClassifierDwaCompressor::Classifier246     Classifier (std::string suffix,
247                 CompressorScheme scheme,
248                 PixelType type,
249                 int cscIdx,
250                 bool caseInsensitive):
251         _suffix(suffix),
252         _scheme(scheme),
253         _type(type),
254         _cscIdx(cscIdx),
255         _caseInsensitive(caseInsensitive)
256     {
257         if (caseInsensitive)
258             transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower);
259     }
260 
ClassifierDwaCompressor::Classifier261     Classifier (const char *&ptr, int size)
262     {
263         if (size <= 0)
264             throw Iex::InputExc("Error uncompressing DWA data"
265                                 " (truncated rule).");
266 
267         {
268             char suffix[Name::SIZE];
269             memset (suffix, 0, Name::SIZE);
270             Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix);
271             _suffix = std::string(suffix);
272         }
273 
274         if (size < _suffix.length() + 1 + 2*Xdr::size<char>())
275             throw Iex::InputExc("Error uncompressing DWA data"
276                                 " (truncated rule).");
277 
278         char value;
279         Xdr::read<CharPtrIO> (ptr, value);
280 
281         _cscIdx = (int)(value >> 4) - 1;
282         if (_cscIdx < -1 || _cscIdx >= 3)
283             throw Iex::InputExc("Error uncompressing DWA data"
284                                 " (corrupt cscIdx rule).");
285 
286         _scheme = (CompressorScheme)((value >> 2) & 3);
287         if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES)
288             throw Iex::InputExc("Error uncompressing DWA data"
289                                 " (corrupt scheme rule).");
290 
291         _caseInsensitive = (value & 1 ? true : false);
292 
293         Xdr::read<CharPtrIO> (ptr, value);
294         if (value < 0 || value >= NUM_PIXELTYPES)
295             throw Iex::InputExc("Error uncompressing DWA data"
296                                 " (corrupt rule).");
297         _type = (PixelType)value;
298     }
299 
matchDwaCompressor::Classifier300     bool match (const std::string &suffix, const PixelType type) const
301     {
302         if (_type != type) return false;
303 
304         if (_caseInsensitive)
305         {
306             std::string tmp(suffix);
307             transform(tmp.begin(), tmp.end(), tmp.begin(), tolower);
308             return tmp == _suffix;
309         }
310 
311         return suffix == _suffix;
312     }
313 
sizeDwaCompressor::Classifier314     size_t size () const
315     {
316         // string length + \0
317         size_t sizeBytes = _suffix.length() + 1;
318 
319         // 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type
320         sizeBytes += 2 * Xdr::size<char>();
321 
322         return sizeBytes;
323     }
324 
writeDwaCompressor::Classifier325     void write (char *&ptr) const
326     {
327         Xdr::write<CharPtrIO> (ptr, _suffix.c_str());
328 
329         // Encode _cscIdx (-1-3) in the upper 4 bits,
330         //        _scheme (0-2)  in the next 2 bits
331         //        _caseInsen     in the bottom bit
332         unsigned char value = 0;
333         value |= ((unsigned char)(_cscIdx+1)      & 15) << 4;
334         value |= ((unsigned char)_scheme          &  3) << 2;
335         value |=  (unsigned char)_caseInsensitive &  1;
336 
337         Xdr::write<CharPtrIO> (ptr, value);
338         Xdr::write<CharPtrIO> (ptr, (unsigned char)_type);
339     }
340 
341     std::string      _suffix;
342     CompressorScheme _scheme;
343     PixelType        _type;
344     int              _cscIdx;
345     bool             _caseInsensitive;
346 };
347 
348 
349 //
350 // Base class for the LOSSY_DCT decoder classes
351 //
352 
353 class DwaCompressor::LossyDctDecoderBase
354 {
355   public:
356 
357     LossyDctDecoderBase
358         (char *packedAc,
359          char *packedDc,
360          const unsigned short *toLinear,
361          int width,
362          int height);
363 
364     virtual ~LossyDctDecoderBase ();
365 
366     void execute();
367 
368     //
369     // These return number of items, not bytes. Each item
370     // is an unsigned short
371     //
372 
numAcValuesEncoded() const373     int numAcValuesEncoded() const { return _packedAcCount; }
numDcValuesEncoded() const374     int numDcValuesEncoded() const { return _packedDcCount; }
375 
376   protected:
377 
378     //
379     // Un-RLE the packed AC components into
380     // a half buffer. The half block should
381     // be the full 8x8 block (in zig-zag order
382     // still), not the first AC component.
383     //
384     // currAcComp is advanced as bytes are decoded.
385     //
386     // This returns the index of the last non-zero
387     // value in the buffer - with the index into zig zag
388     // order data. If we return 0, we have DC only data.
389     //
390 
391     int unRleAc (unsigned short *&currAcComp,
392                  unsigned short  *halfZigBlock);
393 
394 
395     //
396     // if NATIVE and XDR are really the same values, we can
397     // skip some processing and speed things along
398     //
399 
400     bool                  _isNativeXdr;
401 
402 
403     //
404     // Counts of how many items have been packed into the
405     // AC and DC buffers
406     //
407 
408     int                   _packedAcCount;
409     int                   _packedDcCount;
410 
411 
412     //
413     // AC and DC buffers to pack
414     //
415 
416     char                 *_packedAc;
417     char                 *_packedDc;
418 
419 
420     //
421     // half -> half LUT to transform from nonlinear to linear
422     //
423 
424     const unsigned short *_toLinear;
425 
426 
427     //
428     // image dimensions
429     //
430 
431     int                   _width;
432     int                   _height;
433 
434 
435     //
436     // Pointers to the start of each scanlines, to be filled on decode
437     // Generally, these will be filled by the subclasses.
438     //
439 
440     std::vector< std::vector<char *> > _rowPtrs;
441 
442 
443     //
444     // The type of each data that _rowPtrs[i] is referring. Layout
445     // is in the same order as _rowPtrs[].
446     //
447 
448     std::vector<PixelType>             _type;
449     std::vector<SimdAlignedBuffer64f>  _dctData;
450 };
451 
452 
453 //
454 // Used to decode a single channel of LOSSY_DCT data.
455 //
456 
457 class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
458 {
459   public:
460 
461     //
462     // toLinear is a half-float LUT to convert the encoded values
463     // back to linear light. If you want to skip this step, pass
464     // in NULL here.
465     //
466 
LossyDctDecoder(std::vector<char * > & rowPtrs,char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height,PixelType type)467     LossyDctDecoder
468         (std::vector<char *> &rowPtrs,
469          char *packedAc,
470          char *packedDc,
471          const unsigned short *toLinear,
472          int width,
473          int height,
474          PixelType type)
475     :
476         LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
477     {
478         _rowPtrs.push_back(rowPtrs);
479         _type.push_back(type);
480     }
481 
~LossyDctDecoder()482     virtual ~LossyDctDecoder () {}
483 };
484 
485 
486 //
487 // Used to decode 3 channels of LOSSY_DCT data that
488 // are grouped together and color space converted.
489 //
490 
491 class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
492 {
493   public:
494 
495     //
496     // toLinear is a half-float LUT to convert the encoded values
497     // back to linear light. If you want to skip this step, pass
498     // in NULL here.
499     //
500 
LossyDctDecoderCsc(std::vector<char * > & rowPtrsR,std::vector<char * > & rowPtrsG,std::vector<char * > & rowPtrsB,char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height,PixelType typeR,PixelType typeG,PixelType typeB)501     LossyDctDecoderCsc
502         (std::vector<char *> &rowPtrsR,
503          std::vector<char *> &rowPtrsG,
504          std::vector<char *> &rowPtrsB,
505          char *packedAc,
506          char *packedDc,
507          const unsigned short *toLinear,
508          int width,
509          int height,
510          PixelType typeR,
511          PixelType typeG,
512          PixelType typeB)
513     :
514         LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
515     {
516         _rowPtrs.push_back(rowPtrsR);
517         _rowPtrs.push_back(rowPtrsG);
518         _rowPtrs.push_back(rowPtrsB);
519         _type.push_back(typeR);
520         _type.push_back(typeG);
521         _type.push_back(typeB);
522     }
523 
~LossyDctDecoderCsc()524     virtual ~LossyDctDecoderCsc () {}
525 };
526 
527 
528 //
529 // Base class for encoding using the lossy DCT scheme
530 //
531 
532 class DwaCompressor::LossyDctEncoderBase
533 {
534   public:
535 
536     LossyDctEncoderBase
537         (float quantBaseError,
538          char *packedAc,
539          char *packedDc,
540          const unsigned short *toNonlinear,
541          int width,
542          int height);
543 
544     virtual ~LossyDctEncoderBase ();
545 
546     void execute ();
547 
548     //
549     // These return number of items, not bytes. Each item
550     // is an unsigned short
551     //
552 
numAcValuesEncoded() const553     int     numAcValuesEncoded () const {return _numAcComp;}
numDcValuesEncoded() const554     int     numDcValuesEncoded () const {return _numDcComp;}
555 
556   protected:
557 
558     void    toZigZag (half *dst, half *src);
559     int     countSetBits (unsigned short src);
560     half    quantize (half src, float errorTolerance);
561     void    rleAc (half *block, unsigned short *&acPtr);
562 
563     float                      _quantBaseError;
564 
565     int                        _width,
566                                _height;
567     const unsigned short      *_toNonlinear;
568 
569     int                        _numAcComp,
570                                _numDcComp;
571 
572     std::vector< std::vector<const char *> > _rowPtrs;
573     std::vector<PixelType>                   _type;
574     std::vector<SimdAlignedBuffer64f>        _dctData;
575 
576 
577     //
578     // Pointers to the buffers where AC and DC
579     // DCT components should be packed for
580     // lossless compression downstream
581     //
582 
583     char                      *_packedAc;
584     char                      *_packedDc;
585 
586 
587     //
588     // Our "quantization tables" - the example JPEG tables,
589     // normalized so that the smallest value in each is 1.0.
590     // This gives us a relationship between error in DCT
591     // components
592     //
593 
594     float                      _quantTableY[64];
595     float                      _quantTableCbCr[64];
596 };
597 
598 
599 
600 //
601 // Single channel lossy DCT encoder
602 //
603 
604 class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
605 {
606   public:
607 
LossyDctEncoder(float quantBaseError,std::vector<const char * > & rowPtrs,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height,PixelType type)608     LossyDctEncoder
609         (float quantBaseError,
610          std::vector<const char *> &rowPtrs,
611          char *packedAc,
612          char *packedDc,
613          const unsigned short *toNonlinear,
614          int width,
615          int height,
616          PixelType type)
617     :
618         LossyDctEncoderBase
619             (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
620     {
621         _rowPtrs.push_back(rowPtrs);
622         _type.push_back(type);
623     }
624 
~LossyDctEncoder()625     virtual ~LossyDctEncoder () {}
626 };
627 
628 
629 //
630 // RGB channel lossy DCT encoder
631 //
632 
633 class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
634 {
635   public:
636 
LossyDctEncoderCsc(float quantBaseError,std::vector<const char * > & rowPtrsR,std::vector<const char * > & rowPtrsG,std::vector<const char * > & rowPtrsB,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height,PixelType typeR,PixelType typeG,PixelType typeB)637     LossyDctEncoderCsc
638         (float quantBaseError,
639          std::vector<const char *> &rowPtrsR,
640          std::vector<const char *> &rowPtrsG,
641          std::vector<const char *> &rowPtrsB,
642          char *packedAc,
643          char *packedDc,
644          const unsigned short *toNonlinear,
645          int width,
646          int height,
647          PixelType typeR,
648          PixelType typeG,
649          PixelType typeB)
650     :
651         LossyDctEncoderBase
652             (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
653     {
654         _type.push_back(typeR);
655         _type.push_back(typeG);
656         _type.push_back(typeB);
657 
658         _rowPtrs.push_back(rowPtrsR);
659         _rowPtrs.push_back(rowPtrsG);
660         _rowPtrs.push_back(rowPtrsB);
661     }
662 
~LossyDctEncoderCsc()663     virtual ~LossyDctEncoderCsc () {}
664 };
665 
666 
667 // ==============================================================
668 //
669 //                     LossyDctDecoderBase
670 //
671 // --------------------------------------------------------------
672 
LossyDctDecoderBase(char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height)673 DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
674     (char *packedAc,
675      char *packedDc,
676      const unsigned short *toLinear,
677      int width,
678      int height)
679 :
680     _isNativeXdr(false),
681     _packedAcCount(0),
682     _packedDcCount(0),
683     _packedAc(packedAc),
684     _packedDc(packedDc),
685     _toLinear(toLinear),
686     _width(width),
687     _height(height)
688 {
689     if (_toLinear == 0)
690         _toLinear = dwaCompressorNoOp;
691 
692     _isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
693 }
694 
695 
~LossyDctDecoderBase()696 DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}
697 
698 
699 void
execute()700 DwaCompressor::LossyDctDecoderBase::execute ()
701 {
702     int numComp        = _rowPtrs.size();
703     int lastNonZero    = 0;
704     int numBlocksX     = (int) ceil ((float)_width  / 8.0f);
705     int numBlocksY     = (int) ceil ((float)_height / 8.0f);
706     int leftoverX      = _width  - (numBlocksX-1) * 8;
707     int leftoverY      = _height - (numBlocksY-1) * 8;
708 
709     int numFullBlocksX = (int)floor ((float)_width / 8.0f);
710 
711     unsigned short tmpShortNative = 0;
712     unsigned short tmpShortXdr    = 0;
713     const char *tmpConstCharPtr   = 0;
714 
715     unsigned short                    *currAcComp = (unsigned short *)_packedAc;
716     std::vector<unsigned short *>      currDcComp (_rowPtrs.size());
717     std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());
718 
719     if (_type.size() != _rowPtrs.size())
720         throw Iex::BaseExc ("Row pointers and types mismatch in count");
721 
722     if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
723         throw Iex::NoImplExc ("Only 1 and 3 channel encoding is supported");
724 
725     _dctData.resize(numComp);
726 
727     //
728     // Allocate a temp aligned buffer to hold a rows worth of full
729     // 8x8 half-float blocks
730     //
731 
732     unsigned char *rowBlockHandle = new unsigned char
733         [numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];
734 
735     unsigned short *rowBlock[3];
736 
737     rowBlock[0] = (unsigned short*)rowBlockHandle;
738 
739     for (int i = 0; i < _SSE_ALIGNMENT; ++i)
740     {
741         if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
742             rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
743     }
744 
745     for (int comp = 1; comp < numComp; ++comp)
746         rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;
747 
748     //
749     // Pack DC components together by common plane, so we can get
750     // a little more out of differencing them. We'll always have
751     // one component per block, so we can computed offsets.
752     //
753 
754     currDcComp[0] = (unsigned short *)_packedDc;
755 
756     for (unsigned int comp = 1; comp < numComp; ++comp)
757         currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;
758 
759     for (int blocky = 0; blocky < numBlocksY; ++blocky)
760     {
761         int maxY = 8;
762 
763         if (blocky == numBlocksY-1)
764             maxY = leftoverY;
765 
766         int maxX = 8;
767 
768         for (int blockx = 0; blockx < numBlocksX; ++blockx)
769         {
770             if (blockx == numBlocksX-1)
771                 maxX = leftoverX;
772 
773             //
774             // If we can detect that the block is constant values
775             // (all components only have DC values, and all AC is 0),
776             // we can do everything only on 1 value, instead of all
777             // 64.
778             //
779             // This won't really help for regular images, but it is
780             // meant more for layers with large swaths of black
781             //
782 
783             bool blockIsConstant = true;
784 
785             for (unsigned int comp = 0; comp < numComp; ++comp)
786             {
787 
788                 //
789                 // DC component is stored separately
790                 //
791 
792                 #ifdef IMF_HAVE_SSE2
793                     {
794                         __m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;
795 
796                         dst[7] = _mm_setzero_si128();
797                         dst[6] = _mm_setzero_si128();
798                         dst[5] = _mm_setzero_si128();
799                         dst[4] = _mm_setzero_si128();
800                         dst[3] = _mm_setzero_si128();
801                         dst[2] = _mm_setzero_si128();
802                         dst[1] = _mm_setzero_si128();
803                         dst[0] = _mm_insert_epi16
804                             (_mm_setzero_si128(), *currDcComp[comp]++, 0);
805                     }
806                 #else  /* IMF_HAVE_SSE2 */
807 
808                     memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
809                     halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;
810 
811                 #endif /* IMF_HAVE_SSE2 */
812 
813                 _packedDcCount++;
814 
815                 //
816                 // UnRLE the AC. This will modify currAcComp
817                 //
818 
819                 lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);
820 
821                 //
822                 // Convert from XDR to NATIVE
823                 //
824 
825                 if (!_isNativeXdr)
826                 {
827                     for (int i = 0; i < 64; ++i)
828                     {
829                         tmpShortXdr      = halfZigBlock[comp]._buffer[i];
830                         tmpConstCharPtr  = (const char *)&tmpShortXdr;
831 
832                         Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);
833 
834                         halfZigBlock[comp]._buffer[i] = tmpShortNative;
835                     }
836                 }
837 
838                 if (lastNonZero == 0)
839                 {
840                     //
841                     // DC only case - AC components are all 0
842                     //
843 
844                     half h;
845 
846                     h.setBits (halfZigBlock[comp]._buffer[0]);
847                     _dctData[comp]._buffer[0] = (float)h;
848 
849                     dctInverse8x8DcOnly (_dctData[comp]._buffer);
850                 }
851                 else
852                 {
853                     //
854                     // We have some AC components that are non-zero.
855                     // Can't use the 'constant block' optimization
856                     //
857 
858                     blockIsConstant = false;
859 
860                     //
861                     // Un-Zig zag
862                     //
863 
864                     (*fromHalfZigZag)
865                         (halfZigBlock[comp]._buffer, _dctData[comp]._buffer);
866 
867                     //
868                     // Zig-Zag indices in normal layout are as follows:
869                     //
870                     // 0   1   3   6   10  15  21  28
871                     // 2   4   7   11  16  22  29  36
872                     // 5   8   12  17  23  30  37  43
873                     // 9   13  18  24  31  38  44  49
874                     // 14  19  25  32  39  45  50  54
875                     // 20  26  33  40  46  51  55  58
876                     // 27  34  41  47  52  56  59  61
877                     // 35  42  48  53  57  60  62  63
878                     //
879                     // If lastNonZero is less than the first item on
880                     // each row, we know that the whole row is zero and
881                     // can be skipped in the row-oriented part of the
882                     // iDCT.
883                     //
884                     // The unrolled logic here is:
885                     //
886                     //    if lastNonZero < rowStartIdx[i],
887                     //    zeroedRows = rowsEmpty[i]
888                     //
889                     // where:
890                     //
891                     //    const int rowStartIdx[] = {2, 5, 9, 14, 20, 27, 35};
892                     //    const int rowsEmpty[]   = {7, 6, 5,  4,  3,  2,  1};
893                     //
894 
895                     if (lastNonZero < 2)
896                         dctInverse8x8_7(_dctData[comp]._buffer);
897                     else if (lastNonZero < 5)
898                         dctInverse8x8_6(_dctData[comp]._buffer);
899                     else if (lastNonZero < 9)
900                         dctInverse8x8_5(_dctData[comp]._buffer);
901                     else if (lastNonZero < 14)
902                         dctInverse8x8_4(_dctData[comp]._buffer);
903                     else if (lastNonZero < 20)
904                         dctInverse8x8_3(_dctData[comp]._buffer);
905                     else if (lastNonZero < 27)
906                         dctInverse8x8_2(_dctData[comp]._buffer);
907                     else if (lastNonZero < 35)
908                         dctInverse8x8_1(_dctData[comp]._buffer);
909                     else
910                         dctInverse8x8_0(_dctData[comp]._buffer);
911                 }
912             }
913 
914             //
915             // Perform the CSC
916             //
917 
918             if (numComp == 3)
919             {
920                 if (!blockIsConstant)
921                 {
922                     csc709Inverse64 (_dctData[0]._buffer,
923                                      _dctData[1]._buffer,
924                                      _dctData[2]._buffer);
925 
926                 }
927                 else
928                 {
929                     csc709Inverse (_dctData[0]._buffer[0],
930                                    _dctData[1]._buffer[0],
931                                    _dctData[2]._buffer[0]);
932                 }
933             }
934 
935             //
936             // Float -> Half conversion.
937             //
938             // If the block has a constant value, just convert the first pixel.
939             //
940 
941             for (unsigned int comp = 0; comp < numComp; ++comp)
942             {
943                 if (!blockIsConstant)
944                 {
945                     (*convertFloatToHalf64)
946                         (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
947                 }
948                 else
949                 {
950                     #if IMF_HAVE_SSE2
951 
952                         __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];
953 
954                         dst[0] = _mm_set1_epi16
955                             (((half)_dctData[comp]._buffer[0]).bits());
956 
957                         dst[1] = dst[0];
958                         dst[2] = dst[0];
959                         dst[3] = dst[0];
960                         dst[4] = dst[0];
961                         dst[5] = dst[0];
962                         dst[6] = dst[0];
963                         dst[7] = dst[0];
964 
965                     #else  /* IMF_HAVE_SSE2 */
966 
967                         unsigned short *dst = &rowBlock[comp][blockx*64];
968 
969                         dst[0] = ((half)_dctData[comp]._buffer[0]).bits();
970 
971                         for (int i = 1; i < 64; ++i)
972                         {
973                             dst[i] = dst[0];
974                         }
975 
976                     #endif /* IMF_HAVE_SSE2 */
977                 } // blockIsConstant
978             } // comp
979         } // blockx
980 
981         //
982         // At this point, we have half-float nonlinear value blocked
983         // in rowBlock[][]. We need to unblock the data, transfer
984         // back to linear, and write the results in the _rowPtrs[].
985         //
986         // There is a fast-path for aligned rows, which helps
987         // things a little. Since this fast path is only valid
988         // for full 8-element wide blocks, the partial x blocks
989         // are broken into a separate loop below.
990         //
991         // At the moment, the fast path requires:
992         //   * sse support
993         //   * aligned row pointers
994         //   * full 8-element wide blocks
995         //
996 
997         for (int comp = 0; comp < numComp; ++comp)
998         {
999             //
1000             // Test if we can use the fast path
1001             //
1002 
1003         #ifdef IMF_HAVE_SSE2
1004 
1005             bool fastPath = true;
1006 
1007             for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
1008             {
1009                 if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
1010                     fastPath = false;
1011             }
1012 
1013             if (fastPath)
1014             {
1015                 //
1016                 // Handle all the full X blocks, in a fast path with sse2 and
1017                 // aligned row pointers
1018                 //
1019 
1020                 for (int y=8*blocky; y<8*blocky+maxY; ++y)
1021                 {
1022                     __m128i *dst = (__m128i *)_rowPtrs[comp][y];
1023                     __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];
1024 
1025 
1026                     for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
1027                     {
1028                         //
1029                         // These may need some twiddling.
1030                         // Run with multiples of 8
1031                         //
1032 
1033                         _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);
1034 
1035                         unsigned short i0  = _mm_extract_epi16 (*src, 0);
1036                         unsigned short i1  = _mm_extract_epi16 (*src, 1);
1037                         unsigned short i2  = _mm_extract_epi16 (*src, 2);
1038                         unsigned short i3  = _mm_extract_epi16 (*src, 3);
1039 
1040                         unsigned short i4  = _mm_extract_epi16 (*src, 4);
1041                         unsigned short i5  = _mm_extract_epi16 (*src, 5);
1042                         unsigned short i6  = _mm_extract_epi16 (*src, 6);
1043                         unsigned short i7  = _mm_extract_epi16 (*src, 7);
1044 
1045                         i0 = _toLinear[i0];
1046                         i1 = _toLinear[i1];
1047                         i2 = _toLinear[i2];
1048                         i3 = _toLinear[i3];
1049 
1050                         i4 = _toLinear[i4];
1051                         i5 = _toLinear[i5];
1052                         i6 = _toLinear[i6];
1053                         i7 = _toLinear[i7];
1054 
1055                         *dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
1056                         *dst = _mm_insert_epi16 (*dst, i1, 1);
1057                         *dst = _mm_insert_epi16 (*dst, i2, 2);
1058                         *dst = _mm_insert_epi16 (*dst, i3, 3);
1059 
1060                         *dst = _mm_insert_epi16 (*dst, i4, 4);
1061                         *dst = _mm_insert_epi16 (*dst, i5, 5);
1062                         *dst = _mm_insert_epi16 (*dst, i6, 6);
1063                         *dst = _mm_insert_epi16 (*dst, i7, 7);
1064 
1065                         src += 8;
1066                         dst++;
1067                     }
1068                 }
1069             }
1070             else
1071             {
1072 
1073         #endif /* IMF_HAVE_SSE2 */
1074 
1075                 //
1076                 // Basic scalar kinda slow path for handling the full X blocks
1077                 //
1078 
1079                 for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
1080                 {
1081                     unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
1082 
1083                     for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
1084                     {
1085                         unsigned short *src =
1086                             &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];
1087 
1088                         dst[0] = _toLinear[src[0]];
1089                         dst[1] = _toLinear[src[1]];
1090                         dst[2] = _toLinear[src[2]];
1091                         dst[3] = _toLinear[src[3]];
1092 
1093                         dst[4] = _toLinear[src[4]];
1094                         dst[5] = _toLinear[src[5]];
1095                         dst[6] = _toLinear[src[6]];
1096                         dst[7] = _toLinear[src[7]];
1097 
1098                         dst += 8;
1099                     }
1100                 }
1101 
1102         #ifdef IMF_HAVE_SSE2
1103 
1104             }
1105 
1106         #endif /* IMF_HAVE_SSE2 */
1107 
1108             //
1109             // If we have partial X blocks, deal with all those now
1110             // Since this should be minimal work, there currently
1111             // is only one path that should work for everyone.
1112             //
1113 
1114             if (numFullBlocksX != numBlocksX)
1115             {
1116                 for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
1117                 {
1118                     unsigned short *src = (unsigned short *)
1119                         &rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];
1120 
1121                     unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];
1122 
1123                     dst += 8 * numFullBlocksX;
1124 
1125                     for (int x = 0; x < maxX; ++x)
1126                     {
1127                         *dst++ = _toLinear[*src++];
1128                     }
1129                 }
1130             }
1131         } // comp
1132     } // blocky
1133 
1134     //
1135     // Walk over all the channels that are of type FLOAT.
1136     // Convert from HALF XDR back to FLOAT XDR.
1137     //
1138 
1139     for (unsigned int chan = 0; chan < numComp; ++chan)
1140     {
1141 
1142         if (_type[chan] != FLOAT)
1143             continue;
1144 
1145         std::vector<unsigned short> halfXdr (_width);
1146 
1147         for (int y=0; y<_height; ++y)
1148         {
1149             char *floatXdrPtr = _rowPtrs[chan][y];
1150 
1151             memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));
1152 
1153             const char *halfXdrPtr = (const char *)(&halfXdr[0]);
1154 
1155             for (int x=0; x<_width; ++x)
1156             {
1157                 half tmpHalf;
1158 
1159                 Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
1160                 Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);
1161 
1162                 //
1163                 // Xdr::write and Xdr::read will advance the ptrs
1164                 //
1165             }
1166         }
1167     }
1168 
1169     delete[] rowBlockHandle;
1170 }
1171 
1172 
1173 //
1174 // Un-RLE the packed AC components into
1175 // a half buffer. The half block should
1176 // be the full 8x8 block (in zig-zag order
1177 // still), not the first AC component.
1178 //
1179 // currAcComp is advanced as bytes are decoded.
1180 //
1181 // This returns the index of the last non-zero
1182 // value in the buffer - with the index into zig zag
1183 // order data. If we return 0, we have DC only data.
1184 //
1185 // This is assuminging that halfZigBlock is zero'ed
1186 // prior to calling
1187 //
1188 
1189 int
unRleAc(unsigned short * & currAcComp,unsigned short * halfZigBlock)1190 DwaCompressor::LossyDctDecoderBase::unRleAc
1191     (unsigned short *&currAcComp,
1192      unsigned short  *halfZigBlock)
1193 {
1194     //
1195     // Un-RLE the RLE'd blocks. If we find an item whose
1196     // high byte is 0xff, then insert the number of 0's
1197     // as indicated by the low byte.
1198     //
1199     // Otherwise, just copy the number verbaitm.
1200     //
1201 
1202     int lastNonZero          = 0;
1203     int dctComp              = 1;
1204 
1205     //
1206     // Start with a zero'ed block, so we don't have to
1207     // write when we hit a run symbol
1208     //
1209 
1210     while (dctComp < 64)
1211     {
1212         if (*currAcComp == 0xff00)
1213         {
1214             //
1215             // End of block
1216             //
1217 
1218             dctComp = 64;
1219 
1220         }
1221         else if ((*currAcComp) >> 8 == 0xff)
1222         {
1223             //
1224             // Run detected! Insert 0's.
1225             //
1226             // Since the block has been zeroed, just advance the ptr
1227             //
1228 
1229             dctComp += (*currAcComp) & 0xff;
1230         }
1231         else
1232         {
1233             //
1234             // Not a run, just copy over the value
1235             //
1236 
1237             lastNonZero = dctComp;
1238             halfZigBlock[dctComp] = *currAcComp;
1239 
1240             dctComp++;
1241         }
1242 
1243         _packedAcCount++;
1244         currAcComp++;
1245     }
1246 
1247     return lastNonZero;
1248 }
1249 
1250 
1251 // ==============================================================
1252 //
1253 //                     LossyDctEncoderBase
1254 //
1255 // --------------------------------------------------------------
1256 
LossyDctEncoderBase(float quantBaseError,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height)1257 DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
1258     (float quantBaseError,
1259      char *packedAc,
1260      char *packedDc,
1261      const unsigned short *toNonlinear,
1262      int width,
1263      int height)
1264 :
1265     _quantBaseError(quantBaseError),
1266     _width(width),
1267     _height(height),
1268     _toNonlinear(toNonlinear),
1269     _numAcComp(0),
1270     _numDcComp(0),
1271     _packedAc(packedAc),
1272     _packedDc(packedDc)
1273 {
1274     //
1275     // Here, we take the generic JPEG quantization tables and
1276     // normalize them by the smallest component in each table.
1277     // This gives us a relationship amongst the DCT components,
1278     // in terms of how sensitive each component is to
1279     // error.
1280     //
1281     // A higher normalized value means we can quantize more,
1282     // and a small normalized value means we can quantize less.
1283     //
1284     // Eventually, we will want an acceptable quantization
1285     // error range for each component. We find this by
1286     // multiplying some user-specified level (_quantBaseError)
1287     // by the normalized table (_quantTableY, _quantTableCbCr) to
1288     // find the acceptable quantization error range.
1289     //
1290     // The quantization table is not needed for decoding, and
1291     // is not transmitted. So, if you want to get really fancy,
1292     // you could derive some content-dependent quantization
1293     // table, and the decoder would not need to be changed. But,
1294     // for now, we'll just use statice quantization tables.
1295     //
1296 
1297     int jpegQuantTableY[] =
1298     {
1299         16,  11,  10,  16,   24,   40,   51,   61,
1300         12,  12,  14,  19,   26,   58,   60,   55,
1301         14,  13,  16,  24,   40,   57,   69,   56,
1302         14,  17,  22,  29,   51,   87,   80,   62,
1303         18,  22,  37,  56,   68,  109,  103,   77,
1304         24,  35,  55,  64,   81,  104,  113,   92,
1305         49,  64,  78,  87,  103,  121,  120,  101,
1306         72,  92,  95,  98,  112,  100,  103,   99
1307     };
1308 
1309     int jpegQuantTableYMin = 10;
1310 
1311     int jpegQuantTableCbCr[] =
1312     {
1313         17,  18,  24,  47,  99,  99,  99,  99,
1314         18,  21,  26,  66,  99,  99,  99,  99,
1315         24,  26,  56,  99,  99,  99,  99,  99,
1316         47,  66,  99,  99,  99,  99,  99,  99,
1317         99,  99,  99,  99,  99,  99,  99,  99,
1318         99,  99,  99,  99,  99,  99,  99,  99,
1319         99,  99,  99,  99,  99,  99,  99,  99,
1320         99,  99,  99,  99,  99,  99,  99,  99
1321     };
1322 
1323     int jpegQuantTableCbCrMin = 17;
1324 
1325     for (int idx = 0; idx < 64; ++idx)
1326     {
1327         _quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
1328                             static_cast<float> (jpegQuantTableYMin);
1329 
1330         _quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
1331                                static_cast<float> (jpegQuantTableCbCrMin);
1332     }
1333 
1334     if (_quantBaseError < 0)
1335         quantBaseError = 0;
1336 }
1337 
1338 
~LossyDctEncoderBase()1339 DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase ()
1340 {
1341 }
1342 
1343 
//
// Given three channels of source data, encode by first applying
// a color space conversion to a YCbCr space.  Otherwise, if we only
// have one channel, just encode it as is.
//
// Other numbers of channels are unexpected at this point. The
// channel count is checked with assert(), so it is only enforced
// in builds where assertions are enabled.
//
// Any FLOAT channels are first converted to HALF into a temporary
// buffer that is local to this call, and the matching _rowPtrs
// entries are re-pointed at that buffer. Those row pointers are
// therefore not usable once execute() has returned.
//
// DC coefficients are written to _packedDc, grouped by channel
// (numBlocksX * numBlocksY entries per channel). AC coefficients
// are run-length encoded into _packedAc via rleAc(). _numDcComp
// and _numAcComp are reset here and count the symbols written.
//

void
DwaCompressor::LossyDctEncoderBase::execute ()
{
    // Number of 8x8 blocks needed to cover the image, rounding up
    int  numBlocksX   = (int)ceil ((float)_width / 8.0f);
    int  numBlocksY   = (int)ceil ((float)_height/ 8.0f);

    // Quantized coefficients of the current block: zig-zag and normal order
    half halfZigCoef[64];
    half halfCoef[64];

    // Per-channel write cursors into _packedDc, and the shared
    // write cursor into _packedAc
    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    unsigned short               *currAcComp = (unsigned short *)_packedAc;

    _dctData.resize (_rowPtrs.size());
    _numAcComp = 0;
    _numDcComp = 0;

    assert (_type.size() == _rowPtrs.size());
    assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));

    //
    // Allocate a temp half buffer to quantize into for
    // any FLOAT source channels.
    //

    int tmpHalfBufferElements = 0;

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
        if (_type[chan] == FLOAT)
            tmpHalfBufferElements += _width * _height;

    std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);

    char *tmpHalfBufferPtr = 0;

    // Only take the address of element 0 if the vector is non-empty
    if (tmpHalfBufferElements)
        tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];

    //
    // Run over all the float scanlines, quantizing,
    // and re-assigning _rowPtr[y]. We need to translate
    // FLOAT XDR to HALF XDR.
    //

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
    {
        if (_type[chan] != FLOAT)
            continue;

        for (int y = 0; y < _height; ++y)
        {
            float       src = 0;
            const char *srcXdr = _rowPtrs[chan][y];
            char       *dstXdr = tmpHalfBufferPtr;

            for (int x = 0; x < _width; ++x)
            {

                Xdr::read<CharPtrIO> (srcXdr, src);
                Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());

                //
                // Xdr::read and Xdr::write will advance the ptr
                //
            }

            // Re-point this row at its HALF copy, then move on to the
            // next row's slot in the temporary buffer
            _rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
            tmpHalfBufferPtr += _width * sizeof (unsigned short);
        }
    }

    //
    // Pack DC components together by common plane, so we can get
    // a little more out of differencing them. We'll always have
    // one component per block, so we can compute the offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
        currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            half           h;
            unsigned short tmpShortXdr, tmpShortNative;
            char          *tmpCharPtr;

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Break the source into 8x8 blocks. If we don't
                // fit at the edges, mirror.
                //
                // Also, convert from linear to nonlinear representation.
                // Our source is assumed to be XDR, and we need to convert
                // to NATIVE prior to converting to float.
                //
                // If we're converting linear -> nonlinear, assume that the
                // XDR -> NATIVE conversion is built into the lookup. Otherwise,
                // we'll need to explicitly do it.
                //

                for (int y = 0; y < 8; ++y)
                {
                    for (int x = 0; x < 8; ++x)
                    {
                        int vx = 8 * blockx + x;
                        int vy = 8 * blocky + y;

                        //
                        // Reflect coordinates that fall past the right /
                        // bottom edge back into the image. If the reflected
                        // coordinate is itself negative (image narrower or
                        // shorter than the overshoot), fall back to the
                        // last column / row.
                        //

                        if (vx >= _width)
                            vx = _width - (vx - (_width - 1));

                        if (vx < 0) vx = _width-1;

                        if (vy >=_height)
                            vy = _height - (vy - (_height - 1));

                        if (vy < 0) vy = _height-1;

                        tmpShortXdr =
                            ((const unsigned short *)(_rowPtrs[chan])[vy])[vx];

                        if (_toNonlinear)
                        {
                            h.setBits (_toNonlinear[tmpShortXdr]);
                        }
                        else
                        {
                            const char *tmpConstCharPtr =
                                (const char *)(&tmpShortXdr);

                            Xdr::read<CharPtrIO>
                                (tmpConstCharPtr, tmpShortNative);

                            h.setBits(tmpShortNative);
                        }

                        _dctData[chan]._buffer[y * 8 + x] = (float)h;
                    } // x
                } // y
            } // chan

            //
            // Color space conversion
            //

            if (_rowPtrs.size() == 3)
            {
                csc709Forward64 (_dctData[0]._buffer,
                                 _dctData[1]._buffer,
                                 _dctData[2]._buffer);
            }

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Forward DCT
                //

                dctForward8x8(_dctData[chan]._buffer);

                //
                // Quantize to half, and zigzag
                //
                // Channel 0 uses the Y table; every other channel uses
                // the CbCr table.
                //

                if (chan == 0)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableY[i]);
                    }
                }
                else
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableCbCr[i]);
                    }
                }

                toZigZag (halfZigCoef, halfCoef);

                //
                // Convert from NATIVE back to XDR, before we write out
                //
                // Each coefficient is written in XDR form into
                // tmpShortXdr, and those bytes are then stored back
                // into halfZigCoef[i] as its new bit pattern.
                //

                for (int i = 0; i < 64; ++i)
                {
                    tmpCharPtr = (char *)&tmpShortXdr;
                    Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
                    halfZigCoef[i].setBits(tmpShortXdr);
                }

                //
                // Save the DC component separately, to be compressed on
                // its own.
                //

                *currDcComp[chan]++ = halfZigCoef[0].bits();
                _numDcComp++;

                //
                // Then RLE the AC components (which will record the count
                // of the resulting number of items)
                //
                // currAcComp is passed by reference and is advanced by
                // rleAc past the symbols it writes.
                //

                rleAc (halfZigCoef, currAcComp);
            } // chan
        } // blockx
    } // blocky
}
1570 
1571 
1572 //
1573 // Reorder from zig-zag order to normal ordering
1574 //
1575 
1576 void
toZigZag(half * dst,half * src)1577 DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src)
1578 {
1579     const int remap[] =
1580     {
1581          0,
1582          1,  8,
1583         16,  9,  2,
1584          3, 10, 17, 24,
1585         32, 25, 18, 11, 4,
1586          5, 12, 19, 26, 33, 40,
1587         48, 41, 34, 27, 20, 13, 6,
1588          7, 14, 21, 28, 35, 42, 49, 56,
1589             57, 50, 43, 36, 29, 22, 15,
1590                 23, 30, 37, 44, 51, 58,
1591                     59, 52, 45, 38, 31,
1592                         39, 46, 53, 60,
1593                             61, 54, 47,
1594                                 55, 62,
1595                                     63
1596     };
1597 
1598     for (int i=0; i<64; ++i)
1599         dst[i] = src[remap[i]];
1600 }
1601 
1602 
1603 //
1604 // Precomputing the bit count runs faster than using
1605 // the builtin instruction, at least in one case..
1606 //
1607 // Precomputing 8-bits is no slower than 16-bits,
1608 // and saves a fair bit of overhead..
1609 //
1610 
1611 int
countSetBits(unsigned short src)1612 DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src)
1613 {
1614     static const unsigned short numBitsSet[256] =
1615     {
1616         0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
1617         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1618         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1619         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1620         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1621         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1622         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1623         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1624         1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
1625         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1626         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1627         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1628         2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
1629         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1630         3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
1631         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
1632     };
1633 
1634     return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
1635 }
1636 
1637 
1638 //
1639 // Take a DCT coefficient, as well as an acceptable error. Search
1640 // nearby values within the error tolerance, that have fewer
1641 // bits set.
1642 //
1643 // The list of candidates has been pre-computed and sorted
1644 // in order of increasing numbers of bits set. This way, we
1645 // can stop searching as soon as we find a candidate that
1646 // is within the error tolerance.
1647 //
1648 
1649 half
quantize(half src,float errorTolerance)1650 DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
1651 {
1652     half            tmp;
1653     float           srcFloat      = (float)src;
1654     int             numSetBits    = countSetBits(src.bits());
1655     const unsigned short *closest = closestData + closestDataOffset[src.bits()];
1656 
1657     for (int targetNumSetBits = numSetBits - 1;
1658          targetNumSetBits >= 0;
1659          --targetNumSetBits)
1660     {
1661         tmp.setBits (*closest);
1662 
1663         if (fabs ((float)tmp - srcFloat) < errorTolerance)
1664             return tmp;
1665 
1666         closest++;
1667     }
1668 
1669     return src;
1670 }
1671 
1672 
1673 //
1674 // RLE the zig-zag of the AC components + copy over
1675 // into another tmp buffer
1676 //
1677 // Try to do a simple RLE scheme to reduce run's of 0's. This
1678 // differs from the jpeg EOB case, since EOB just indicates that
1679 // the rest of the block is zero. In our case, we have lots of
1680 // NaN symbols, which shouldn't be allowed to occur in DCT
1681 // coefficents - so we'll use them for encoding runs.
1682 //
1683 // If the high byte is 0xff, then we have a run of 0's, of length
1684 // given by the low byte. For example, 0xff03 would be a run
1685 // of 3 0's, starting at the current location.
1686 //
1687 // block is our block of 64 coefficients
1688 // acPtr a pointer to back the RLE'd values into.
1689 //
1690 // This will advance the counter, _numAcComp.
1691 //
1692 
1693 void
rleAc(half * block,unsigned short * & acPtr)1694 DwaCompressor::LossyDctEncoderBase::rleAc
1695     (half *block,
1696      unsigned short *&acPtr)
1697 {
1698     int dctComp              = 1;
1699     unsigned short rleSymbol = 0x0;
1700 
1701     while (dctComp < 64)
1702     {
1703         int runLen = 1;
1704 
1705         //
1706         // If we don't have a 0, output verbatim
1707         //
1708 
1709         if (block[dctComp].bits() != rleSymbol)
1710         {
1711             *acPtr++ =  block[dctComp].bits();
1712             _numAcComp++;
1713 
1714             dctComp += runLen;
1715             continue;
1716         }
1717 
1718         //
1719         // We're sitting on a 0, so see how big the run is.
1720         //
1721 
1722         while ((dctComp+runLen < 64) &&
1723                (block[dctComp+runLen].bits() == rleSymbol))
1724         {
1725             runLen++;
1726         }
1727 
1728         //
1729         // If the run len is too small, just output verbatim
1730         // otherwise output our run token
1731         //
1732         // Originally, we wouldn't have a separate symbol for
1733         // "end of block". But in some experimentation, it looks
1734         // like using 0xff00 for "end of block" can save a bit
1735         // of space.
1736         //
1737 
1738         if (runLen == 1)
1739         {
1740             runLen           = 1;
1741             *acPtr++ = block[dctComp].bits();
1742             _numAcComp++;
1743 
1744             //
1745             // Using 0xff00 for "end of block"
1746             //
1747         }
1748         else if (runLen + dctComp == 64)
1749         {
1750             //
1751             // Signal EOB
1752             //
1753 
1754             *acPtr++ = 0xff00;
1755             _numAcComp++;
1756         }
1757         else
1758         {
1759             //
1760             // Signal normal run
1761             //
1762 
1763             *acPtr++   = 0xff00 | runLen;
1764             _numAcComp++;
1765         }
1766 
1767         //
1768         // Advance by runLen
1769         //
1770 
1771         dctComp += runLen;
1772     }
1773 }
1774 
1775 
1776 // ==============================================================
1777 //
1778 //                     DwaCompressor
1779 //
1780 // --------------------------------------------------------------
1781 
1782 //
1783 // DwaCompressor()
1784 //
1785 
DwaCompressor(const Header & hdr,int maxScanLineSize,int numScanLines,AcCompression acCompression)1786 DwaCompressor::DwaCompressor
1787     (const Header &hdr,
1788      int maxScanLineSize,
1789      int numScanLines,
1790      AcCompression acCompression)
1791 :
1792     Compressor(hdr),
1793     _acCompression(acCompression),
1794     _maxScanLineSize(maxScanLineSize),
1795     _numScanLines(numScanLines),
1796     _channels(hdr.channels()),
1797     _packedAcBuffer(0),
1798     _packedAcBufferSize(0),
1799     _packedDcBuffer(0),
1800     _packedDcBufferSize(0),
1801     _rleBuffer(0),
1802     _rleBufferSize(0),
1803     _outBuffer(0),
1804     _outBufferSize(0),
1805     _zip(0),
1806     _dwaCompressionLevel(45.0)
1807 {
1808     _min[0] = hdr.dataWindow().min.x;
1809     _min[1] = hdr.dataWindow().min.y;
1810     _max[0] = hdr.dataWindow().max.x;
1811     _max[1] = hdr.dataWindow().max.y;
1812 
1813     for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i)
1814     {
1815         _planarUncBuffer[i] = 0;
1816         _planarUncBufferSize[i] = 0;
1817     }
1818 
1819     //
1820     // Check the header for a quality attribute
1821     //
1822 
1823     if (hasDwaCompressionLevel (hdr))
1824         _dwaCompressionLevel = dwaCompressionLevel (hdr);
1825 }
1826 
1827 
~DwaCompressor()1828 DwaCompressor::~DwaCompressor()
1829 {
1830     delete[] _packedAcBuffer;
1831     delete[] _packedDcBuffer;
1832     delete[] _rleBuffer;
1833     delete[] _outBuffer;
1834     delete _zip;
1835 
1836     for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
1837         delete[] _planarUncBuffer[i];
1838 }
1839 
1840 
1841 int
numScanLines() const1842 DwaCompressor::numScanLines() const
1843 {
1844     return _numScanLines;
1845 }
1846 
1847 
1848 Imf::Compressor::Format
format() const1849 DwaCompressor::format() const
1850 {
1851     if (GLOBAL_SYSTEM_LITTLE_ENDIAN)
1852         return NATIVE;
1853     else
1854         return XDR;
1855 }
1856 
1857 
1858 int
compress(const char * inPtr,int inSize,int minY,const char * & outPtr)1859 DwaCompressor::compress
1860     (const char *inPtr,
1861      int inSize,
1862      int minY,
1863      const char *&outPtr)
1864 {
1865     return compress
1866         (inPtr,
1867          inSize,
1868          Imath::Box2i (Imath::V2i (_min[0], minY),
1869                        Imath::V2i (_max[0], minY + numScanLines() - 1)),
1870          outPtr);
1871 }
1872 
1873 
1874 int
compressTile(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)1875 DwaCompressor::compressTile
1876     (const char *inPtr,
1877      int inSize,
1878      Imath::Box2i range,
1879      const char *&outPtr)
1880 {
1881     return compress (inPtr, inSize, range, outPtr);
1882 }
1883 
1884 
1885 int
compress(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)1886 DwaCompressor::compress
1887     (const char *inPtr,
1888      int inSize,
1889      Imath::Box2i range,
1890      const char  *&outPtr)
1891 {
1892     const char *inDataPtr   = inPtr;
1893     char       *packedAcEnd = 0;
1894     char       *packedDcEnd = 0;
1895     int         fileVersion = 2;   // Starting with 2, we write the channel
1896                                    // classification rules into the file
1897 
1898     if (fileVersion < 2)
1899         initializeLegacyChannelRules();
1900     else
1901         initializeDefaultChannelRules();
1902 
1903     size_t outBufferSize = 0;
1904     initializeBuffers(outBufferSize);
1905 
1906     unsigned short          channelRuleSize = 0;
1907     std::vector<Classifier> channelRules;
1908     if (fileVersion >= 2)
1909     {
1910         relevantChannelRules(channelRules);
1911 
1912         channelRuleSize = Xdr::size<unsigned short>();
1913         for (size_t i = 0; i < channelRules.size(); ++i)
1914             channelRuleSize += channelRules[i].size();
1915     }
1916 
1917     //
1918     // Remember to allocate _outBuffer, if we haven't done so already.
1919     //
1920 
1921     outBufferSize += channelRuleSize;
1922     if (outBufferSize > _outBufferSize)
1923     {
1924         _outBufferSize = outBufferSize;
1925         if (_outBuffer == 0)
1926             delete[] _outBuffer;
1927         _outBuffer = new char[outBufferSize];
1928     }
1929 
1930     char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64) +
1931                                    channelRuleSize];
1932 
1933     //
1934     // We might not be dealing with any color data, in which
1935     // case the AC buffer size will be 0, and deferencing
1936     // a vector will not be a good thing to do.
1937     //
1938 
1939     if (_packedAcBuffer)
1940         packedAcEnd = _packedAcBuffer;
1941 
1942     if (_packedDcBuffer)
1943         packedDcEnd = _packedDcBuffer;
1944 
1945     #define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]
1946 
1947     Int64 *version                 = OBIDX (VERSION);
1948     Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
1949     Int64 *unknownCompressedSize   = OBIDX (UNKNOWN_COMPRESSED_SIZE);
1950     Int64 *acCompressedSize        = OBIDX (AC_COMPRESSED_SIZE);
1951     Int64 *dcCompressedSize        = OBIDX (DC_COMPRESSED_SIZE);
1952     Int64 *rleCompressedSize       = OBIDX (RLE_COMPRESSED_SIZE);
1953     Int64 *rleUncompressedSize     = OBIDX (RLE_UNCOMPRESSED_SIZE);
1954     Int64 *rleRawSize              = OBIDX (RLE_RAW_SIZE);
1955 
1956     Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
1957     Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);
1958 
1959     Int64 *acCompression            = OBIDX (AC_COMPRESSION);
1960 
1961     int minX   = range.min.x;
1962     int maxX   = std::min(range.max.x, _max[0]);
1963     int minY   = range.min.y;
1964     int maxY   = std::min(range.max.y, _max[1]);
1965 
1966     //
1967     // Zero all the numbers in the chunk header
1968     //
1969 
1970     memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));
1971 
1972     //
1973     // Setup the AC compression strategy and the version in the data block,
1974     // then write the relevant channel classification rules if needed
1975     //
1976     *version       = fileVersion;
1977     *acCompression = _acCompression;
1978 
1979     setupChannelData (minX, minY, maxX, maxY);
1980 
1981     if (fileVersion >= 2)
1982     {
1983         char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64)];
1984         Xdr::write<CharPtrIO> (writePtr, channelRuleSize);
1985 
1986         for (size_t i = 0; i < channelRules.size(); ++i)
1987             channelRules[i].write(writePtr);
1988     }
1989 
1990     //
1991     // Determine the start of each row in the input buffer
1992     // Channels are interleaved by scanline
1993     //
1994 
1995     std::vector<bool> encodedChannels (_channelData.size());
1996     std::vector< std::vector<const char *> > rowPtrs (_channelData.size());
1997 
1998     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
1999         encodedChannels[chan] = false;
2000 
2001     inDataPtr =  inPtr;
2002 
2003     for (int y = minY; y <= maxY; ++y)
2004     {
2005         for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2006         {
2007 
2008             ChannelData *cd = &_channelData[chan];
2009 
2010             if (Imath::modp(y, cd->ySampling) != 0)
2011                 continue;
2012 
2013             rowPtrs[chan].push_back(inDataPtr);
2014             inDataPtr += cd->width * Imf::pixelTypeSize(cd->type);
2015         }
2016     }
2017 
2018     inDataPtr = inPtr;
2019 
2020     //
2021     // Make a pass over all our CSC sets and try to encode them first
2022     //
2023 
2024     for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
2025     {
2026 
2027         LossyDctEncoderCsc encoder
2028             (_dwaCompressionLevel / 100000.f,
2029              rowPtrs[_cscSets[csc].idx[0]],
2030              rowPtrs[_cscSets[csc].idx[1]],
2031              rowPtrs[_cscSets[csc].idx[2]],
2032              packedAcEnd,
2033              packedDcEnd,
2034              dwaCompressorToNonlinear,
2035              _channelData[_cscSets[csc].idx[0]].width,
2036              _channelData[_cscSets[csc].idx[0]].height,
2037              _channelData[_cscSets[csc].idx[0]].type,
2038              _channelData[_cscSets[csc].idx[1]].type,
2039              _channelData[_cscSets[csc].idx[2]].type);
2040 
2041         encoder.execute();
2042 
2043         *totalAcUncompressedCount  += encoder.numAcValuesEncoded();
2044         *totalDcUncompressedCount  += encoder.numDcValuesEncoded();
2045 
2046         packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
2047         packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);
2048 
2049         encodedChannels[_cscSets[csc].idx[0]] = true;
2050         encodedChannels[_cscSets[csc].idx[1]] = true;
2051         encodedChannels[_cscSets[csc].idx[2]] = true;
2052     }
2053 
2054     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2055     {
2056         ChannelData *cd = &_channelData[chan];
2057 
2058         if (encodedChannels[chan])
2059             continue;
2060 
2061         switch (cd->compression)
2062         {
2063           case LOSSY_DCT:
2064 
2065             //
2066             // For LOSSY_DCT, treat this just like the CSC'd case,
2067             // but only operate on one channel
2068             //
2069 
2070             {
2071                 const unsigned short *nonlinearLut = 0;
2072 
2073                 if (!cd->pLinear)
2074                     nonlinearLut = dwaCompressorToNonlinear;
2075 
2076                 LossyDctEncoder encoder
2077                     (_dwaCompressionLevel / 100000.f,
2078                      rowPtrs[chan],
2079                      packedAcEnd,
2080                      packedDcEnd,
2081                      nonlinearLut,
2082                      cd->width,
2083                      cd->height,
2084                      cd->type);
2085 
2086                 encoder.execute();
2087 
2088                 *totalAcUncompressedCount  += encoder.numAcValuesEncoded();
2089                 *totalDcUncompressedCount  += encoder.numDcValuesEncoded();
2090 
2091                 packedAcEnd +=
2092                     encoder.numAcValuesEncoded() * sizeof (unsigned short);
2093 
2094                 packedDcEnd +=
2095                     encoder.numDcValuesEncoded() * sizeof (unsigned short);
2096             }
2097 
2098             break;
2099 
2100           case RLE:
2101 
2102             //
2103             // For RLE, bash the bytes up so that the first bytes of each
2104             // pixel are contingous, as are the second bytes, and so on.
2105             //
2106 
2107             for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
2108             {
2109                 const char *row = rowPtrs[chan][y];
2110 
2111                 for (int x = 0; x < cd->width; ++x)
2112                 {
2113                     for (int byte = 0;
2114                          byte < Imf::pixelTypeSize (cd->type);
2115                          ++byte)
2116                     {
2117 
2118                         *cd->planarUncRleEnd[byte]++ = *row++;
2119                     }
2120                 }
2121 
2122                 *rleRawSize += cd->width * Imf::pixelTypeSize(cd->type);
2123             }
2124 
2125             break;
2126 
2127           case UNKNOWN:
2128 
2129             //
2130             // Otherwise, just copy data over verbatim
2131             //
2132 
2133             {
2134                 int scanlineSize = cd->width * Imf::pixelTypeSize(cd->type);
2135 
2136                 for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
2137                 {
2138                     memcpy (cd->planarUncBufferEnd,
2139                             rowPtrs[chan][y],
2140                             scanlineSize);
2141 
2142                     cd->planarUncBufferEnd += scanlineSize;
2143                 }
2144 
2145                 *unknownUncompressedSize += cd->planarUncSize;
2146             }
2147 
2148             break;
2149 
2150           default:
2151 
2152             assert (false);
2153         }
2154 
2155         encodedChannels[chan] = true;
2156     }
2157 
2158     //
2159     // Pack the Unknown data into the output buffer first. Instead of
2160     // just copying it uncompressed, try zlib compression at least.
2161     //
2162 
2163     if (*unknownUncompressedSize > 0)
2164     {
2165         uLongf inSize  = (uLongf)(*unknownUncompressedSize);
2166         uLongf outSize = (uLongf)(ceil ((float)inSize * 1.01f) + 100);
2167 
2168         if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
2169                                  &outSize,
2170                                  (const Bytef *)_planarUncBuffer[UNKNOWN],
2171                                  inSize,
2172                                  9))
2173         {
2174             throw Iex::BaseExc ("Data compression (zlib) failed.");
2175         }
2176 
2177         outDataPtr += outSize;
2178         *unknownCompressedSize = outSize;
2179     }
2180 
2181     //
2182     // Now, pack all the Lossy DCT coefficients into our output
2183     // buffer, with Huffman encoding.
2184     //
2185     // Also, record the compressed size and the number of
2186     // uncompressed componentns we have.
2187     //
2188 
2189     if (*totalAcUncompressedCount > 0)
2190     {
2191         switch (_acCompression)
2192         {
2193           case STATIC_HUFFMAN:
2194 
2195             *acCompressedSize = (int)
2196                 hufCompress((unsigned short *)_packedAcBuffer,
2197                             (int)*totalAcUncompressedCount,
2198                             outDataPtr);
2199             break;
2200 
2201           case DEFLATE:
2202 
2203             {
2204                 uLongf destLen = (uLongf)
2205                     (2 * (*totalAcUncompressedCount) * sizeof (unsigned short));
2206 
2207                 if (Z_OK != ::compress2
2208                                 ((Bytef *)outDataPtr,
2209                                  &destLen,
2210                                  (Bytef *)_packedAcBuffer,
2211                                  (uLong)(*totalAcUncompressedCount
2212                                                 * sizeof (unsigned short)),
2213                                  9))
2214                 {
2215                     throw Iex::InputExc ("Data compression (zlib) failed.");
2216                 }
2217 
2218                 *acCompressedSize = destLen;
2219             }
2220 
2221             break;
2222 
2223           default:
2224 
2225             assert (false);
2226         }
2227 
2228         outDataPtr += *acCompressedSize;
2229     }
2230 
2231     //
2232     // Handle the DC components separately
2233     //
2234 
2235     if (*totalDcUncompressedCount > 0)
2236     {
2237         *dcCompressedSize = _zip->compress
2238             (_packedDcBuffer,
2239              (int)(*totalDcUncompressedCount) * sizeof (unsigned short),
2240              outDataPtr);
2241 
2242         outDataPtr += *dcCompressedSize;
2243     }
2244 
2245     //
2246     // If we have RLE data, first RLE encode it and set the uncompressed
2247     // size. Then, deflate the results and set the compressed size.
2248     //
2249 
2250     if (*rleRawSize > 0)
2251     {
2252         *rleUncompressedSize = rleCompress
2253             ((int)(*rleRawSize),
2254              _planarUncBuffer[RLE],
2255              (signed char *)_rleBuffer);
2256 
2257         uLongf dstLen =
2258             (uLongf)ceil (1.01f * (float) * rleUncompressedSize) + 24;
2259 
2260         if (Z_OK != ::compress2
2261                         ((Bytef *)outDataPtr,
2262                          &dstLen,
2263                          (Bytef *)_rleBuffer,
2264                          (uLong)(*rleUncompressedSize),
2265                          9))
2266         {
2267             throw Iex::BaseExc ("Error compressing RLE'd data.");
2268         }
2269 
2270        *rleCompressedSize = dstLen;
2271         outDataPtr       += *rleCompressedSize;
2272     }
2273 
2274     //
2275     // Flip the counters to XDR format
2276     //
2277 
2278     for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
2279     {
2280         Int64  src = *(((Int64 *)_outBuffer) + i);
2281         char  *dst = (char *)(((Int64 *)_outBuffer) + i);
2282 
2283         Xdr::write<CharPtrIO> (dst, src);
2284     }
2285 
2286     //
2287     // We're done - compute the number of bytes we packed
2288     //
2289 
2290     outPtr = _outBuffer;
2291 
2292     return static_cast<int>(outDataPtr - _outBuffer + 1);
2293 }
2294 
2295 
2296 int
uncompress(const char * inPtr,int inSize,int minY,const char * & outPtr)2297 DwaCompressor::uncompress
2298     (const char *inPtr,
2299      int inSize,
2300      int minY,
2301      const char *&outPtr)
2302 {
2303     return uncompress (inPtr,
2304                        inSize,
2305                        Imath::Box2i (Imath::V2i (_min[0], minY),
2306                        Imath::V2i (_max[0], minY + numScanLines() - 1)),
2307                        outPtr);
2308 }
2309 
2310 
2311 int
uncompressTile(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)2312 DwaCompressor::uncompressTile
2313     (const char *inPtr,
2314      int inSize,
2315      Imath::Box2i range,
2316      const char *&outPtr)
2317 {
2318     return uncompress (inPtr, inSize, range, outPtr);
2319 }
2320 
2321 
2322 int
uncompress(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)2323 DwaCompressor::uncompress
2324     (const char *inPtr,
2325      int inSize,
2326      Imath::Box2i range,
2327      const char *&outPtr)
2328 {
2329     int minX = range.min.x;
2330     int maxX = std::min (range.max.x, _max[0]);
2331     int minY = range.min.y;
2332     int maxY = std::min (range.max.y, _max[1]);
2333 
2334     int headerSize = NUM_SIZES_SINGLE*sizeof(Int64);
2335     if (inSize < headerSize)
2336     {
2337         throw Iex::InputExc("Error uncompressing DWA data"
2338                             "(truncated header).");
2339     }
2340 
2341     //
2342     // Flip the counters from XDR to NATIVE
2343     //
2344 
2345     for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
2346     {
2347         Int64      *dst =  (((Int64 *)inPtr) + i);
2348         const char *src = (char *)(((Int64 *)inPtr) + i);
2349 
2350         Xdr::read<CharPtrIO> (src, *dst);
2351     }
2352 
2353     //
2354     // Unwind all the counter info
2355     //
2356 
2357     const Int64 *inPtr64 = (const Int64*) inPtr;
2358 
2359     Int64 version                  = *(inPtr64 + VERSION);
2360     Int64 unknownUncompressedSize  = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE);
2361     Int64 unknownCompressedSize    = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE);
2362     Int64 acCompressedSize         = *(inPtr64 + AC_COMPRESSED_SIZE);
2363     Int64 dcCompressedSize         = *(inPtr64 + DC_COMPRESSED_SIZE);
2364     Int64 rleCompressedSize        = *(inPtr64 + RLE_COMPRESSED_SIZE);
2365     Int64 rleUncompressedSize      = *(inPtr64 + RLE_UNCOMPRESSED_SIZE);
2366     Int64 rleRawSize               = *(inPtr64 + RLE_RAW_SIZE);
2367 
2368     Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT);
2369     Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT);
2370 
2371     Int64 acCompression            = *(inPtr64 + AC_COMPRESSION);
2372 
2373     Int64 compressedSize           = unknownCompressedSize +
2374                                      acCompressedSize +
2375                                      dcCompressedSize +
2376                                      rleCompressedSize;
2377 
2378     const char *dataPtr            = inPtr + NUM_SIZES_SINGLE * sizeof(Int64);
2379 
2380     /* Both the sum and individual sizes are checked in case of overflow. */
2381     if (inSize < (headerSize + compressedSize) ||
2382         inSize < unknownCompressedSize ||
2383         inSize < acCompressedSize ||
2384         inSize < dcCompressedSize ||
2385         inSize < rleCompressedSize)
2386     {
2387         throw Iex::InputExc("Error uncompressing DWA data"
2388                             "(truncated file).");
2389     }
2390 
2391     if (unknownUncompressedSize < 0  ||
2392         unknownCompressedSize < 0    ||
2393         acCompressedSize < 0         ||
2394         dcCompressedSize < 0         ||
2395         rleCompressedSize < 0        ||
2396         rleUncompressedSize < 0      ||
2397         rleRawSize < 0               ||
2398         totalAcUncompressedCount < 0 ||
2399         totalDcUncompressedCount < 0)
2400     {
2401         throw Iex::InputExc("Error uncompressing DWA data"
2402                             " (corrupt header).");
2403     }
2404 
2405     if (version < 2)
2406         initializeLegacyChannelRules();
2407     else
2408     {
2409         unsigned short ruleSize = 0;
2410         Xdr::read<CharPtrIO>(dataPtr, ruleSize);
2411 
2412         if (ruleSize < 0)
2413             throw Iex::InputExc("Error uncompressing DWA data"
2414                                 " (corrupt header file).");
2415 
2416         headerSize += ruleSize;
2417         if (inSize < headerSize + compressedSize)
2418             throw Iex::InputExc("Error uncompressing DWA data"
2419                                 " (truncated file).");
2420 
2421         _channelRules.clear();
2422         ruleSize -= Xdr::size<unsigned short> ();
2423         while (ruleSize > 0)
2424         {
2425             Classifier rule(dataPtr, ruleSize);
2426 
2427             _channelRules.push_back(rule);
2428             ruleSize -= rule.size();
2429         }
2430     }
2431 
2432 
2433     size_t outBufferSize = 0;
2434     initializeBuffers(outBufferSize);
2435 
2436     //
2437     // Allocate _outBuffer, if we haven't done so already
2438     //
2439 
2440     if (_maxScanLineSize * numScanLines() > _outBufferSize)
2441     {
2442         _outBufferSize = _maxScanLineSize * numScanLines();
2443         if (_outBuffer != 0)
2444             delete[] _outBuffer;
2445         _outBuffer = new char[_maxScanLineSize * numScanLines()];
2446     }
2447 
2448 
2449     char *outBufferEnd = _outBuffer;
2450 
2451 
2452     //
2453     // Find the start of the RLE packed AC components and
2454     // the DC components for each channel. This will be handy
2455     // if you want to decode the channels in parallel later on.
2456     //
2457 
2458     char *packedAcBufferEnd = 0;
2459 
2460     if (_packedAcBuffer)
2461         packedAcBufferEnd = _packedAcBuffer;
2462 
2463     char *packedDcBufferEnd = 0;
2464 
2465     if (_packedDcBuffer)
2466         packedDcBufferEnd = _packedDcBuffer;
2467 
2468     //
2469     // UNKNOWN data is packed first, followed by the
2470     // Huffman-compressed AC, then the DC values,
2471     // and then the zlib compressed RLE data.
2472     //
2473 
2474     const char *compressedUnknownBuf = dataPtr;
2475 
2476     const char *compressedAcBuf      = compressedUnknownBuf +
2477                                   static_cast<ptrdiff_t>(unknownCompressedSize);
2478     const char *compressedDcBuf      = compressedAcBuf +
2479                                   static_cast<ptrdiff_t>(acCompressedSize);
2480     const char *compressedRleBuf     = compressedDcBuf +
2481                                   static_cast<ptrdiff_t>(dcCompressedSize);
2482 
2483     //
2484     // Sanity check that the version is something we expect. Right now,
2485     // we can decode version 0, 1, and 2. v1 adds 'end of block' symbols
2486     // to the AC RLE. v2 adds channel classification rules at the
2487     // start of the data block.
2488     //
2489 
2490     if ((version < 0) || (version > 2))
2491         throw Iex::InputExc ("Invalid version of compressed data block");
2492 
2493     setupChannelData(minX, minY, maxX, maxY);
2494 
2495     //
2496     // Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN]
2497     //
2498 
2499     if (unknownCompressedSize > 0)
2500     {
2501         uLongf outSize = static_cast<uLongf>(
2502                 ceil( (float)unknownUncompressedSize * 1.01) + 100);
2503 
2504         if (unknownUncompressedSize < 0 ||
2505             outSize > _planarUncBufferSize[UNKNOWN])
2506         {
2507             throw Iex::InputExc("Error uncompressing DWA data"
2508                                 "(corrupt header).");
2509         }
2510 
2511         if (Z_OK != ::uncompress
2512                         ((Bytef *)_planarUncBuffer[UNKNOWN],
2513                          &outSize,
2514                          (Bytef *)compressedUnknownBuf,
2515                          (uLong)unknownCompressedSize))
2516         {
2517             throw Iex::BaseExc("Error uncompressing UNKNOWN data.");
2518         }
2519     }
2520 
2521     //
2522     // Uncompress the AC data into _packedAcBuffer
2523     //
2524 
2525     if (acCompressedSize > 0)
2526     {
2527         if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize)
2528         {
2529             throw Iex::InputExc("Error uncompressing DWA data"
2530                                 "(corrupt header).");
2531         }
2532 
2533         //
2534         // Don't trust the user to get it right, look in the file.
2535         //
2536 
2537         switch (acCompression)
2538         {
2539           case STATIC_HUFFMAN:
2540 
2541             hufUncompress
2542                 (compressedAcBuf,
2543                  (int)acCompressedSize,
2544                  (unsigned short *)_packedAcBuffer,
2545                  (int)totalAcUncompressedCount);
2546 
2547             break;
2548 
2549           case DEFLATE:
2550             {
2551                 uLongf destLen =
2552                     (int)(totalAcUncompressedCount) * sizeof (unsigned short);
2553 
2554                 if (Z_OK != ::uncompress
2555                                 ((Bytef *)_packedAcBuffer,
2556                                  &destLen,
2557                                  (Bytef *)compressedAcBuf,
2558                                  (uLong)acCompressedSize))
2559                 {
2560                     throw Iex::InputExc ("Data decompression (zlib) failed.");
2561                 }
2562 
2563                 if (totalAcUncompressedCount * sizeof (unsigned short) !=
2564                                 destLen)
2565                 {
2566                     throw Iex::InputExc ("AC data corrupt.");
2567                 }
2568             }
2569             break;
2570 
2571           default:
2572 
2573             throw Iex::NoImplExc ("Unknown AC Compression");
2574             break;
2575         }
2576     }
2577 
2578     //
2579     // Uncompress the DC data into _packedDcBuffer
2580     //
2581 
2582     if (dcCompressedSize > 0)
2583     {
2584         if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize)
2585         {
2586             throw Iex::InputExc("Error uncompressing DWA data"
2587                                 "(corrupt header).");
2588         }
2589 
2590         if (_zip->uncompress
2591                     (compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer)
2592             != (int)totalDcUncompressedCount * sizeof (unsigned short))
2593         {
2594             throw Iex::BaseExc("DC data corrupt.");
2595         }
2596     }
2597 
2598     //
2599     // Uncompress the RLE data into _rleBuffer, then unRLE the results
2600     // into _planarUncBuffer[RLE]
2601     //
2602 
2603     if (rleRawSize > 0)
2604     {
2605         if (rleUncompressedSize > _rleBufferSize ||
2606             rleRawSize > _planarUncBufferSize[RLE])
2607         {
2608             throw Iex::InputExc("Error uncompressing DWA data"
2609                                 "(corrupt header).");
2610         }
2611 
2612         uLongf dstLen = (uLongf)rleUncompressedSize;
2613 
2614         if (Z_OK != ::uncompress
2615                         ((Bytef *)_rleBuffer,
2616                          &dstLen,
2617                          (Bytef *)compressedRleBuf,
2618                          (uLong)rleCompressedSize))
2619         {
2620             throw Iex::BaseExc("Error uncompressing RLE data.");
2621         }
2622 
2623         if (dstLen != rleUncompressedSize)
2624             throw Iex::BaseExc("RLE data corrupted");
2625 
2626         if (rleUncompress
2627                 ((int)rleUncompressedSize,
2628                  (int)rleRawSize,
2629                  (signed char *)_rleBuffer,
2630                  _planarUncBuffer[RLE]) != rleRawSize)
2631         {
2632             throw Iex::BaseExc("RLE data corrupted");
2633         }
2634     }
2635 
2636     //
2637     // Determine the start of each row in the output buffer
2638     //
2639 
2640     std::vector<bool> decodedChannels (_channelData.size());
2641     std::vector< std::vector<char *> > rowPtrs (_channelData.size());
2642 
2643     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2644         decodedChannels[chan] = false;
2645 
2646     outBufferEnd = _outBuffer;
2647 
2648     for (int y = minY; y <= maxY; ++y)
2649     {
2650         for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2651         {
2652             ChannelData *cd = &_channelData[chan];
2653 
2654             if (Imath::modp (y, cd->ySampling) != 0)
2655                 continue;
2656 
2657             rowPtrs[chan].push_back (outBufferEnd);
2658             outBufferEnd += cd->width * Imf::pixelTypeSize (cd->type);
2659         }
2660     }
2661 
2662     //
2663     // Setup to decode each block of 3 channels that need to
2664     // be handled together
2665     //
2666 
2667     for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
2668     {
2669         int rChan = _cscSets[csc].idx[0];
2670         int gChan = _cscSets[csc].idx[1];
2671         int bChan = _cscSets[csc].idx[2];
2672 
2673 
2674         LossyDctDecoderCsc decoder
2675             (rowPtrs[rChan],
2676              rowPtrs[gChan],
2677              rowPtrs[bChan],
2678              packedAcBufferEnd,
2679              packedDcBufferEnd,
2680              dwaCompressorToLinear,
2681              _channelData[rChan].width,
2682              _channelData[rChan].height,
2683              _channelData[rChan].type,
2684              _channelData[gChan].type,
2685              _channelData[bChan].type);
2686 
2687         decoder.execute();
2688 
2689         packedAcBufferEnd +=
2690             decoder.numAcValuesEncoded() * sizeof (unsigned short);
2691 
2692         packedDcBufferEnd +=
2693             decoder.numDcValuesEncoded() * sizeof (unsigned short);
2694 
2695         decodedChannels[rChan] = true;
2696         decodedChannels[gChan] = true;
2697         decodedChannels[bChan] = true;
2698     }
2699 
2700     //
2701     // Setup to handle the remaining channels by themselves
2702     //
2703 
2704     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2705     {
2706         if (decodedChannels[chan])
2707             continue;
2708 
2709         ChannelData *cd = &_channelData[chan];
2710         int pixelSize = Imf::pixelTypeSize (cd->type);
2711 
2712         switch (cd->compression)
2713         {
2714           case LOSSY_DCT:
2715 
2716             //
2717             // Setup a single-channel lossy DCT decoder pointing
2718             // at the output buffer
2719             //
2720 
2721             {
2722                 const unsigned short *linearLut = 0;
2723 
2724                 if (!cd->pLinear)
2725                     linearLut = dwaCompressorToLinear;
2726 
2727                 LossyDctDecoder decoder
2728                     (rowPtrs[chan],
2729                      packedAcBufferEnd,
2730                      packedDcBufferEnd,
2731                      linearLut,
2732                      cd->width,
2733                      cd->height,
2734                      cd->type);
2735 
2736                 decoder.execute();
2737 
2738                 packedAcBufferEnd +=
2739                     decoder.numAcValuesEncoded() * sizeof (unsigned short);
2740 
2741                 packedDcBufferEnd +=
2742                     decoder.numDcValuesEncoded() * sizeof (unsigned short);
2743             }
2744 
2745             break;
2746 
2747           case RLE:
2748 
2749             //
2750             // For the RLE case, the data has been un-RLE'd into
2751             // planarUncRleEnd[], but is still split out by bytes.
2752             // We need to rearrange the bytes back into the correct
2753             // order in the output buffer;
2754             //
2755 
2756             {
2757                 int row = 0;
2758 
2759                 for (int y = minY; y <= maxY; ++y)
2760                 {
2761                     if (Imath::modp (y, cd->ySampling) != 0)
2762                         continue;
2763 
2764                     char *dst = rowPtrs[chan][row];
2765 
2766                     if (pixelSize == 2)
2767                     {
2768                         interleaveByte2 (dst,
2769                                          cd->planarUncRleEnd[0],
2770                                          cd->planarUncRleEnd[1],
2771                                          cd->width);
2772 
2773                         cd->planarUncRleEnd[0] += cd->width;
2774                         cd->planarUncRleEnd[1] += cd->width;
2775                     }
2776                     else
2777                     {
2778                         for (int x = 0; x < cd->width; ++x)
2779                         {
2780                             for (int byte = 0; byte < pixelSize; ++byte)
2781                             {
2782                                *dst++ = *cd->planarUncRleEnd[byte]++;
2783                             }
2784                         }
2785                     }
2786 
2787                     row++;
2788                 }
2789             }
2790 
2791             break;
2792 
2793           case UNKNOWN:
2794 
2795             //
2796             // In the UNKNOWN case, data is already in planarUncBufferEnd
2797             // and just needs to copied over to the output buffer
2798             //
2799 
2800             {
2801                 int row             = 0;
2802                 int dstScanlineSize = cd->width * Imf::pixelTypeSize (cd->type);
2803 
2804                 for (int y = minY; y <= maxY; ++y)
2805                 {
2806                     if (Imath::modp (y, cd->ySampling) != 0)
2807                         continue;
2808 
2809                     memcpy (rowPtrs[chan][row],
2810                             cd->planarUncBufferEnd,
2811                             dstScanlineSize);
2812 
2813                     cd->planarUncBufferEnd += dstScanlineSize;
2814                     row++;
2815                 }
2816             }
2817 
2818             break;
2819 
2820           default:
2821 
2822             throw Iex::NoImplExc ("Unhandled compression scheme case");
2823             break;
2824         }
2825 
2826         decodedChannels[chan] = true;
2827     }
2828 
2829     //
2830     // Return a ptr to _outBuffer
2831     //
2832 
2833     outPtr = _outBuffer;
2834     return (int)(outBufferEnd - _outBuffer);
2835 }
2836 
2837 
2838 // static
2839 void
initializeFuncs()2840 DwaCompressor::initializeFuncs()
2841 {
2842     convertFloatToHalf64 = convertFloatToHalf64_scalar;
2843     fromHalfZigZag       = fromHalfZigZag_scalar;
2844 
2845     CpuId cpuId;
2846 
2847     //
2848     // Setup HALF <-> FLOAT conversion implementations
2849     //
2850 
2851     if (cpuId.avx && cpuId.f16c)
2852     {
2853         convertFloatToHalf64 = convertFloatToHalf64_f16c;
2854         fromHalfZigZag       = fromHalfZigZag_f16c;
2855     }
2856 
2857     //
2858     // Setup inverse DCT implementations
2859     //
2860 
2861     dctInverse8x8_0 = dctInverse8x8_scalar<0>;
2862     dctInverse8x8_1 = dctInverse8x8_scalar<1>;
2863     dctInverse8x8_2 = dctInverse8x8_scalar<2>;
2864     dctInverse8x8_3 = dctInverse8x8_scalar<3>;
2865     dctInverse8x8_4 = dctInverse8x8_scalar<4>;
2866     dctInverse8x8_5 = dctInverse8x8_scalar<5>;
2867     dctInverse8x8_6 = dctInverse8x8_scalar<6>;
2868     dctInverse8x8_7 = dctInverse8x8_scalar<7>;
2869 
2870     if (cpuId.avx)
2871     {
2872         dctInverse8x8_0 = dctInverse8x8_avx<0>;
2873         dctInverse8x8_1 = dctInverse8x8_avx<1>;
2874         dctInverse8x8_2 = dctInverse8x8_avx<2>;
2875         dctInverse8x8_3 = dctInverse8x8_avx<3>;
2876         dctInverse8x8_4 = dctInverse8x8_avx<4>;
2877         dctInverse8x8_5 = dctInverse8x8_avx<5>;
2878         dctInverse8x8_6 = dctInverse8x8_avx<6>;
2879         dctInverse8x8_7 = dctInverse8x8_avx<7>;
2880     }
2881     else if (cpuId.sse2)
2882     {
2883         dctInverse8x8_0 = dctInverse8x8_sse2<0>;
2884         dctInverse8x8_1 = dctInverse8x8_sse2<1>;
2885         dctInverse8x8_2 = dctInverse8x8_sse2<2>;
2886         dctInverse8x8_3 = dctInverse8x8_sse2<3>;
2887         dctInverse8x8_4 = dctInverse8x8_sse2<4>;
2888         dctInverse8x8_5 = dctInverse8x8_sse2<5>;
2889         dctInverse8x8_6 = dctInverse8x8_sse2<6>;
2890         dctInverse8x8_7 = dctInverse8x8_sse2<7>;
2891     }
2892 }
2893 
2894 
2895 //
2896 // Handle channel classification and buffer allocation once we know
2897 // how to classify channels
2898 //
2899 
2900 void
initializeBuffers(size_t & outBufferSize)2901 DwaCompressor::initializeBuffers (size_t &outBufferSize)
2902 {
2903     classifyChannels (_channels, _channelData, _cscSets);
2904 
2905     //
2906     // _outBuffer needs to be big enough to hold all our
2907     // compressed data - which could vary depending on what sort
2908     // of channels we have.
2909     //
2910 
2911     int maxOutBufferSize  = 0;
2912     int numLossyDctChans  = 0;
2913     int unknownBufferSize = 0;
2914     int rleBufferSize     = 0;
2915 
2916     int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) *
2917                             (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
2918                             63 * sizeof (unsigned short);
2919 
2920     int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) *
2921                             (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
2922                             sizeof (unsigned short);
2923 
2924     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2925     {
2926         switch (_channelData[chan].compression)
2927         {
2928           case LOSSY_DCT:
2929 
2930             //
2931             // This is the size of the number of packed
2932             // components, plus the requirements for
2933             // maximum Huffman encoding size.
2934             //
2935 
2936             maxOutBufferSize += 2 * maxLossyDctAcSize + 65536;
2937             numLossyDctChans++;
2938             break;
2939 
2940           case RLE:
2941             {
2942                 //
2943                 // RLE, if gone horribly wrong, could double the size
2944                 // of the source data.
2945                 //
2946 
2947                 int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
2948                                 Imf::pixelTypeSize (_channelData[chan].type);
2949 
2950                 rleBufferSize += rleAmount;
2951             }
2952             break;
2953 
2954 
2955           case UNKNOWN:
2956 
2957             unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
2958                                  Imf::pixelTypeSize (_channelData[chan].type);
2959             break;
2960 
2961           default:
2962 
2963             throw Iex::NoImplExc ("Unhandled compression scheme case");
2964             break;
2965         }
2966     }
2967 
2968     //
2969     // Also, since the results of the RLE are packed into
2970     // the output buffer, we need the extra room there. But
2971     // we're going to zlib compress() the data we pack,
2972     // which could take slightly more space
2973     //
2974 
2975     maxOutBufferSize += (int)(ceil (1.01f * (float)rleBufferSize) + 100);
2976 
2977     //
2978     // And the same goes for the UNKNOWN data
2979     //
2980 
2981     maxOutBufferSize += (int)(ceil (1.01f * (float)unknownBufferSize) + 100);
2982 
2983     //
2984     // Allocate a zip/deflate compressor big enought to hold the DC data
2985     // and include it's compressed results in the size requirements
2986     // for our output buffer
2987     //
2988 
2989     if (_zip == 0)
2990         _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
2991     else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
2992     {
2993         delete _zip;
2994         _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
2995     }
2996 
2997 
2998     maxOutBufferSize += _zip->maxCompressedSize();
2999 
3000     //
3001     // We also need to reserve space at the head of the buffer to
3002     // write out the size of our various packed and compressed data.
3003     //
3004 
3005     maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64);
3006 
3007 
3008     //
3009     // Later, we're going to hijack outBuffer for the result of
3010     // both encoding and decoding. So it needs to be big enough
3011     // to hold either a buffers' worth of uncompressed or
3012     // compressed data
3013     //
3014     // For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
3015     // but for decoding, we only need it to be maxScanLineSize*numScanLines.
3016     // Cache the max size for now, and alloc the buffer when we either
3017     // encode or decode.
3018     //
3019 
3020     outBufferSize = maxOutBufferSize;
3021 
3022 
3023     //
3024     // _packedAcBuffer holds the quantized DCT coefficients prior
3025     // to Huffman encoding
3026     //
3027 
3028     if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
3029     {
3030         _packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
3031         if (_packedAcBuffer != 0)
3032             delete[] _packedAcBuffer;
3033         _packedAcBuffer = new char[_packedAcBufferSize];
3034     }
3035 
3036     //
3037     // _packedDcBuffer holds one quantized DCT coef per 8x8 block
3038     //
3039 
3040     if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
3041     {
3042         _packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
3043         if (_packedDcBuffer != 0)
3044             delete[] _packedDcBuffer;
3045         _packedDcBuffer     = new char[_packedDcBufferSize];
3046     }
3047 
3048     if (rleBufferSize > _rleBufferSize)
3049     {
3050         _rleBufferSize = rleBufferSize;
3051         if (_rleBuffer != 0)
3052             delete[] _rleBuffer;
3053         _rleBuffer = new char[rleBufferSize];
3054     }
3055 
3056     //
3057     // The planar uncompressed buffer will hold float data for LOSSY_DCT
3058     // compressed values, and whatever the native type is for other
3059     // channels. We're going to use this to hold data in a planar
3060     // format, as opposed to the native interleaved format we take
3061     // into compress() and give back from uncompress().
3062     //
3063     // This also makes it easier to compress the UNKNOWN and RLE data
3064     // all in one swoop (for each compression scheme).
3065     //
3066 
3067     int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
3068     for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
3069         planarUncBufferSize[i] = 0;
3070 
3071     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
3072     {
3073         switch (_channelData[chan].compression)
3074         {
3075           case LOSSY_DCT:
3076             break;
3077 
3078           case RLE:
3079             planarUncBufferSize[RLE] +=
3080                      numScanLines() * (_max[0] - _min[0] + 1) *
3081                      Imf::pixelTypeSize (_channelData[chan].type);
3082             break;
3083 
3084           case UNKNOWN:
3085             planarUncBufferSize[UNKNOWN] +=
3086                      numScanLines() * (_max[0] - _min[0] + 1) *
3087                      Imf::pixelTypeSize (_channelData[chan].type);
3088             break;
3089 
3090           default:
3091             throw Iex::NoImplExc ("Unhandled compression scheme case");
3092             break;
3093         }
3094     }
3095 
3096     //
3097     // UNKNOWN data is going to be zlib compressed, which needs
3098     // a little extra headroom
3099     //
3100 
3101     if (planarUncBufferSize[UNKNOWN] > 0)
3102     {
3103         planarUncBufferSize[UNKNOWN] =
3104             (int) ceil (1.01f * (float)planarUncBufferSize[UNKNOWN]) + 100;
3105     }
3106 
3107     for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
3108     {
3109         if (planarUncBufferSize[i] > _planarUncBufferSize[i])
3110         {
3111             _planarUncBufferSize[i] = planarUncBufferSize[i];
3112             if (_planarUncBuffer[i] != 0)
3113                 delete[] _planarUncBuffer[i];
3114             _planarUncBuffer[i] = new char[planarUncBufferSize[i]];
3115         }
3116     }
3117 }
3118 
3119 
3120 //
3121 // Setup channel classification rules to use when writing files
3122 //
3123 
3124 void
initializeDefaultChannelRules()3125 DwaCompressor::initializeDefaultChannelRules ()
3126 {
3127     _channelRules.clear();
3128 
3129     _channelRules.push_back (Classifier ("R",     LOSSY_DCT, HALF,   0, false));
3130     _channelRules.push_back (Classifier ("R",     LOSSY_DCT, FLOAT,  0, false));
3131     _channelRules.push_back (Classifier ("G",     LOSSY_DCT, HALF,   1, false));
3132     _channelRules.push_back (Classifier ("G",     LOSSY_DCT, FLOAT,  1, false));
3133     _channelRules.push_back (Classifier ("B",     LOSSY_DCT, HALF,   2, false));
3134     _channelRules.push_back (Classifier ("B",     LOSSY_DCT, FLOAT,  2, false));
3135 
3136     _channelRules.push_back (Classifier ("Y",     LOSSY_DCT, HALF,  -1, false));
3137     _channelRules.push_back (Classifier ("Y",     LOSSY_DCT, FLOAT, -1, false));
3138     _channelRules.push_back (Classifier ("BY",    LOSSY_DCT, HALF,  -1, false));
3139     _channelRules.push_back (Classifier ("BY",    LOSSY_DCT, FLOAT, -1, false));
3140     _channelRules.push_back (Classifier ("RY",    LOSSY_DCT, HALF,  -1, false));
3141     _channelRules.push_back (Classifier ("RY",    LOSSY_DCT, FLOAT, -1, false));
3142 
3143     _channelRules.push_back (Classifier ("A",     RLE,       UINT,  -1, false));
3144     _channelRules.push_back (Classifier ("A",     RLE,       HALF,  -1, false));
3145     _channelRules.push_back (Classifier ("A",     RLE,       FLOAT, -1, false));
3146 }
3147 
3148 
3149 //
3150 // Setup channel classification rules when reading files with VERSION < 2
3151 //
3152 
3153 void
initializeLegacyChannelRules()3154 DwaCompressor::initializeLegacyChannelRules ()
3155 {
3156     _channelRules.clear();
3157 
3158     _channelRules.push_back (Classifier ("r",     LOSSY_DCT, HALF,   0, true));
3159     _channelRules.push_back (Classifier ("r",     LOSSY_DCT, FLOAT,  0, true));
3160     _channelRules.push_back (Classifier ("red",   LOSSY_DCT, HALF,   0, true));
3161     _channelRules.push_back (Classifier ("red",   LOSSY_DCT, FLOAT,  0, true));
3162     _channelRules.push_back (Classifier ("g",     LOSSY_DCT, HALF,   1, true));
3163     _channelRules.push_back (Classifier ("g",     LOSSY_DCT, FLOAT,  1, true));
3164     _channelRules.push_back (Classifier ("grn",   LOSSY_DCT, HALF,   1, true));
3165     _channelRules.push_back (Classifier ("grn",   LOSSY_DCT, FLOAT,  1, true));
3166     _channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF,   1, true));
3167     _channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT,  1, true));
3168     _channelRules.push_back (Classifier ("b",     LOSSY_DCT, HALF,   2, true));
3169     _channelRules.push_back (Classifier ("b",     LOSSY_DCT, FLOAT,  2, true));
3170     _channelRules.push_back (Classifier ("blu",   LOSSY_DCT, HALF,   2, true));
3171     _channelRules.push_back (Classifier ("blu",   LOSSY_DCT, FLOAT,  2, true));
3172     _channelRules.push_back (Classifier ("blue",  LOSSY_DCT, HALF,   2, true));
3173     _channelRules.push_back (Classifier ("blue",  LOSSY_DCT, FLOAT,  2, true));
3174 
3175     _channelRules.push_back (Classifier ("y",     LOSSY_DCT, HALF,  -1, true));
3176     _channelRules.push_back (Classifier ("y",     LOSSY_DCT, FLOAT, -1, true));
3177     _channelRules.push_back (Classifier ("by",    LOSSY_DCT, HALF,  -1, true));
3178     _channelRules.push_back (Classifier ("by",    LOSSY_DCT, FLOAT, -1, true));
3179     _channelRules.push_back (Classifier ("ry",    LOSSY_DCT, HALF,  -1, true));
3180     _channelRules.push_back (Classifier ("ry",    LOSSY_DCT, FLOAT, -1, true));
3181     _channelRules.push_back (Classifier ("a",     RLE,       UINT,  -1, true));
3182     _channelRules.push_back (Classifier ("a",     RLE,       HALF,  -1, true));
3183     _channelRules.push_back (Classifier ("a",     RLE,       FLOAT, -1, true));
3184 }
3185 
3186 
3187 //
3188 // Given a set of rules and ChannelData, figure out which rules apply
3189 //
3190 
3191 void
relevantChannelRules(std::vector<Classifier> & rules) const3192 DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const
3193 {
3194     rules.clear();
3195 
3196     std::vector<std::string> suffixes;
3197 
3198     for (size_t cd = 0; cd < _channelData.size(); ++cd)
3199     {
3200         std::string suffix  = _channelData[cd].name;
3201         size_t      lastDot = suffix.find_last_of ('.');
3202 
3203         if (lastDot != std::string::npos)
3204             suffix = suffix.substr (lastDot+1, std::string::npos);
3205 
3206         suffixes.push_back(suffix);
3207     }
3208 
3209 
3210     for (size_t i = 0; i < _channelRules.size(); ++i)
3211     {
3212         for (size_t cd = 0; cd < _channelData.size(); ++cd)
3213         {
3214             if (_channelRules[i].match (suffixes[cd], _channelData[cd].type ))
3215             {
3216                 rules.push_back (_channelRules[i]);
3217                 break;
3218             }
3219         }
3220     }
3221 }
3222 
3223 
3224 //
3225 // Take our initial list of channels, and cache the contents.
3226 //
// Determine appropriate compression schemes for each channel,
3228 // and figure out which sets should potentially be CSC'ed
3229 // prior to lossy compression.
3230 //
3231 
3232 void
classifyChannels(ChannelList channels,std::vector<ChannelData> & chanData,std::vector<CscChannelSet> & cscData)3233 DwaCompressor::classifyChannels
3234     (ChannelList channels,
3235      std::vector<ChannelData> &chanData,
3236      std::vector<CscChannelSet> &cscData)
3237 {
3238     //
3239     // prefixMap used to map channel name prefixes to
3240     // potential CSC-able sets of channels.
3241     //
3242 
3243     std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
3244     std::vector<DwaCompressor::CscChannelSet>           tmpCscSet;
3245 
3246     unsigned int numChan = 0;
3247 
3248     for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
3249         numChan++;
3250 
3251     if (numChan)
3252         chanData.resize (numChan);
3253 
3254     //
3255     // Cache the relevant data from the channel structs.
3256     //
3257 
3258     unsigned int offset = 0;
3259 
3260     for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
3261     {
3262         chanData[offset].name        = std::string (c.name());
3263         chanData[offset].compression = UNKNOWN;
3264         chanData[offset].xSampling   = c.channel().xSampling;
3265         chanData[offset].ySampling   = c.channel().ySampling;
3266         chanData[offset].type        = c.channel().type;
3267         chanData[offset].pLinear     = c.channel().pLinear;
3268 
3269         offset++;
3270     }
3271 
3272     //
3273     // Try and figure out which channels should be
3274     // compressed by which means.
3275     //
3276 
3277     for (offset = 0; offset<numChan; ++offset)
3278     {
3279         std::string prefix  = "";
3280         std::string suffix  = chanData[offset].name;
3281         size_t      lastDot = suffix.find_last_of ('.');
3282 
3283         if (lastDot != std::string::npos)
3284         {
3285             prefix = suffix.substr (0,         lastDot);
3286             suffix = suffix.substr (lastDot+1, std::string::npos);
3287         }
3288 
3289         //
3290         // Make sure we have an entry in our CSC set map
3291         //
3292 
3293         std::map<std::string, DwaCompressor::CscChannelSet>::iterator
3294             theSet = prefixMap.find (prefix);
3295 
3296         if (theSet == prefixMap.end())
3297         {
3298             DwaCompressor::CscChannelSet tmpSet;
3299 
3300             tmpSet.idx[0] =
3301             tmpSet.idx[1] =
3302             tmpSet.idx[2] = -1;
3303 
3304             prefixMap[prefix] = tmpSet;
3305         }
3306 
3307         //
3308         // Check the suffix against the list of classifications
3309         // we defined previously. If the _cscIdx is not negative,
3310         // it indicates that we should be part of a CSC group.
3311         //
3312 
3313         for (std::vector<Classifier>::iterator i = _channelRules.begin();
3314              i != _channelRules.end();
3315              ++i)
3316         {
3317             if ( i->match(suffix, chanData[offset].type) )
3318             {
3319                 chanData[offset].compression = i->_scheme;
3320 
3321                 if ( i->_cscIdx >= 0)
3322                     prefixMap[prefix].idx[i->_cscIdx] = offset;
3323             }
3324         }
3325     }
3326 
3327     //
3328     // Finally, try and find RGB sets of channels which
3329     // can be CSC'ed to a Y'CbCr space prior to loss, for
3330     // better compression.
3331     //
3332     // Walk over our set of candidates, and see who has
3333     // all three channels defined (and has common sampling
3334     // patterns, etc).
3335     //
3336 
3337     for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator
3338          theItem = prefixMap.begin(); theItem != prefixMap.end();
3339          ++theItem)
3340     {
3341         int red = (*theItem).second.idx[0];
3342         int grn = (*theItem).second.idx[1];
3343         int blu = (*theItem).second.idx[2];
3344 
3345         if ((red < 0) || (grn < 0) || (blu < 0))
3346             continue;
3347 
3348         if ((chanData[red].xSampling != chanData[grn].xSampling) ||
3349             (chanData[red].xSampling != chanData[blu].xSampling) ||
3350             (chanData[grn].xSampling != chanData[blu].xSampling) ||
3351             (chanData[red].ySampling != chanData[grn].ySampling) ||
3352             (chanData[red].ySampling != chanData[blu].ySampling) ||
3353             (chanData[grn].ySampling != chanData[blu].ySampling))
3354         {
3355             continue;
3356         }
3357 
3358         tmpCscSet.push_back ((*theItem).second);
3359     }
3360 
3361     size_t numCsc = tmpCscSet.size();
3362 
3363     if (numCsc)
3364         cscData.resize(numCsc);
3365 
3366     for (offset = 0; offset < numCsc; ++offset)
3367         cscData[offset] = tmpCscSet[offset];
3368 }
3369 
3370 
3371 
3372 //
3373 // Setup some buffer pointers, determine channel sizes, things
3374 // like that.
3375 //
3376 
3377 void
setupChannelData(int minX,int minY,int maxX,int maxY)3378 DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
3379 {
3380     char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];
3381 
3382     for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
3383     {
3384         planarUncBuffer[i] = 0;
3385 
3386         if (_planarUncBuffer[i])
3387             planarUncBuffer[i] =  _planarUncBuffer[i];
3388     }
3389 
3390     for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
3391     {
3392         ChannelData *cd = &_channelData[chan];
3393 
3394         cd->width  = Imf::numSamples (cd->xSampling, minX, maxX);
3395         cd->height = Imf::numSamples (cd->ySampling, minY, maxY);
3396 
3397         cd->planarUncSize =
3398             cd->width * cd->height * Imf::pixelTypeSize (cd->type);
3399 
3400         cd->planarUncBuffer    = planarUncBuffer[cd->compression];
3401         cd->planarUncBufferEnd = cd->planarUncBuffer;
3402 
3403         cd->planarUncRle[0]    = cd->planarUncBuffer;
3404         cd->planarUncRleEnd[0] = cd->planarUncRle[0];
3405 
3406         for (int byte = 1; byte < Imf::pixelTypeSize(cd->type); ++byte)
3407         {
3408             cd->planarUncRle[byte] =
3409                          cd->planarUncRle[byte-1] + cd->width * cd->height;
3410 
3411             cd->planarUncRleEnd[byte] =
3412                          cd->planarUncRle[byte];
3413         }
3414 
3415         cd->planarUncType = cd->type;
3416 
3417         if (cd->compression == LOSSY_DCT)
3418         {
3419             cd->planarUncType = FLOAT;
3420         }
3421         else
3422         {
3423             planarUncBuffer[cd->compression] +=
3424                 cd->width * cd->height * Imf::pixelTypeSize (cd->planarUncType);
3425         }
3426     }
3427 }
3428 
3429 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
3430