1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // * Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33
34 //---------------------------------------------------
35 //
36 // class DwaCompressor -- Store lossy RGB data by quantizing
37 // DCT components.
38 //
39 // First, we try and figure out what compression strategy to take
40 // based in channel name. For RGB channels, we want a lossy method
41 // described below. But, if we have alpha, we should do something
42 // different (and probably using RLE). If we have depth, or velocity,
43 // or something else, just fall back to ZIP. The rules for deciding
44 // which strategy to use are setup in initializeDefaultChannelRules().
45 // When writing a file, the relevant rules needed to decode are written
46 // into the start of the data block, making a self-contained file.
// If initializeDefaultChannelRules() doesn't quite suit your naming
// conventions, you can adjust the rules without breaking decoder
// compatibility.
50 //
51 // If we're going to lossy compress R, G, or B channels, it's easier
52 // to toss bits in a more perceptual uniform space. One could argue
// at length as to what constitutes perceptually uniform, especially
54 // when storing either scene/input/focal plane referred and output referred
55 // data.
56 //
57 // We'll compromise. For values <= 1, we use a traditional power function
58 // (without any of that straight-line business at the bottom). For values > 1,
59 // we want something more like a log function, since power functions blow
60 // up. At 1, we want a smooth blend between the functions. So, we use a
61 // piecewise function that does just that - see dwaLookups.cpp for
62 // a little more detail.
63 //
64 // Also, if we find that we have R, G, and B channels from the same layer,
65 // we can get a bit more compression efficiency by transforming to a Y'CbCr
66 // space. We use the 709 transform, but with Cb,Cr = 0 for an input of
67 // (0, 0, 0), instead of the traditional Cb,Cr = .5. Shifting the zero point
// makes no sense with large range data. Transforms are applied to
// the perceptual space data, not the linear-light space data (R'G'B' ->
// Y'CbCr, not RGB -> YCbCr).
71 //
72 // Next, we forward DCT the data. This is done with a floating
73 // point DCT, as we don't really have control over the src range. The
74 // resulting values are dropped to half-float precision.
75 //
76 // Now, we need to quantize. Quantization departs from the usual way
77 // of dividing and rounding. Instead, we start with some floating
78 // point "base-error" value. From this, we can derive quantization
79 // error for each DCT component. Take the standard JPEG quantization
80 // tables and normalize them by the smallest value. Then, multiply
81 // the normalized quant tables by our base-error value. This gives
82 // a range of errors for each DCT component.
83 //
84 // For each DCT component, we want to find a quantized value that
85 // is within +- the per-component error. Pick the quantized value
86 // that has the fewest bits set in its' binary representation.
// Brute-forcing the search would make for extremely inefficient
// compression. Fortunately, we can precompute a table to assist
89 // with this search.
90 //
91 // For each 16-bit float value, there are at most 15 other values with
92 // fewer bits set. We can precompute these values in a compact form, since
// many source values have far fewer than 15 possible quantized values.
94 // Now, instead of searching the entire range +- the component error,
95 // we can just search at most 15 quantization candidates. The search can
96 // be accelerated a bit more by sorting the candidates by the
97 // number of bits set, in increasing order. Then, the search can stop
98 // once a candidate is found w/i the per-component quantization
99 // error range.
100 //
101 // The quantization strategy has the side-benefit that there is no
102 // de-quantization step upon decode, so we don't bother recording
103 // the quantization table.
104 //
105 // Ok. So we now have quantized values. Time for entropy coding. We
106 // can use either static Huffman or zlib/DEFLATE. The static Huffman
107 // is more efficient at compacting data, but can have a greater
108 // overhead, especially for smaller tile/strip sizes.
109 //
110 // There is some additional fun, like ZIP compressing the DC components
111 // instead of Huffman/zlib, which helps make things slightly smaller.
112 //
113 // Compression level is controlled by setting an int/float/double attribute
114 // on the header named "dwaCompressionLevel". This is a thinly veiled name for
115 // the "base-error" value mentioned above. The "base-error" is just
116 // dwaCompressionLevel / 100000. The default value of 45.0 is generally
117 // pretty good at generating "visually lossless" values at reasonable
118 // data rates. Setting dwaCompressionLevel to 0 should result in no additional
119 // quantization at the quantization stage (though there may be
120 // quantization in practice at the CSC/DCT steps). But if you really
// want lossless compression, there are plenty of other choices
122 // of compressors ;)
123 //
124 // When dealing with FLOAT source buffers, we first quantize the source
125 // to HALF and continue down as we would for HALF source.
126 //
127 //---------------------------------------------------
128
129
130 #include "ImfDwaCompressor.h"
131 #include "ImfDwaCompressorSimd.h"
132
133 #include "ImfChannelList.h"
134 #include "ImfStandardAttributes.h"
135 #include "ImfHeader.h"
136 #include "ImfHuf.h"
137 #include "ImfInt64.h"
138 #include "ImfIntAttribute.h"
139 #include "ImfIO.h"
140 #include "ImfMisc.h"
141 #include "ImfNamespace.h"
142 #include "ImfRle.h"
143 #include "ImfSimd.h"
144 #include "ImfSystemSpecific.h"
145 #include "ImfXdr.h"
146 #include "ImfZip.h"
147
148 #include "ImathFun.h"
149 #include "ImathBox.h"
150 #include "ImathVec.h"
151 #include "half.h"
152
153 #include "dwaLookups.h"
154
155 #include <vector>
156 #include <string>
157 #include <cctype>
158 #include <cassert>
159 #include <algorithm>
160
161 // Windows specific addition to prevent the indirect import of the redefined min/max macros
162 #if defined _WIN32 || defined _WIN64
163 #ifdef NOMINMAX
164 #undef NOMINMAX
165 #endif
166 #define NOMINMAX
167 #endif
168 #include <zlib.h>
169
170
171 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
172
173
namespace {

//
// Function pointer used to dispatch to an appropriate
// convertFloatToHalf64_* implementation, selected by runtime CPU
// feature checking.  Should be initialized in
// DwaCompressor::initializeFuncs(); until then it falls back to the
// portable scalar version.
//

void (*convertFloatToHalf64)(unsigned short*, float*) =
    convertFloatToHalf64_scalar;

//
// Function pointer for dispatching a fromHalfZigZag_ implementation
// (undo zig-zag ordering while converting a half 8x8 block to float).
//

void (*fromHalfZigZag)(unsigned short*, float*) =
    fromHalfZigZag_scalar;

//
// Dispatch the inverse DCT on an 8x8 block, where the last
// n rows can be all zeros. The n=0 case converts the full block.
// Like the pointers above, these start at the scalar implementations
// and may be re-pointed at SIMD versions in initializeFuncs().
//

void (*dctInverse8x8_0)(float*) = dctInverse8x8_scalar<0>;
void (*dctInverse8x8_1)(float*) = dctInverse8x8_scalar<1>;
void (*dctInverse8x8_2)(float*) = dctInverse8x8_scalar<2>;
void (*dctInverse8x8_3)(float*) = dctInverse8x8_scalar<3>;
void (*dctInverse8x8_4)(float*) = dctInverse8x8_scalar<4>;
void (*dctInverse8x8_5)(float*) = dctInverse8x8_scalar<5>;
void (*dctInverse8x8_6)(float*) = dctInverse8x8_scalar<6>;
void (*dctInverse8x8_7)(float*) = dctInverse8x8_scalar<7>;

} // namespace
206
207
//
// Per-channel bookkeeping: which compression scheme the channel uses,
// its sampling / pixel type, and scratch buffers for the planar
// (de-interleaved) copy of its data.
//

struct DwaCompressor::ChannelData
{
    std::string       name;         // full channel name from the header
    CompressorScheme  compression;  // scheme chosen by the classifier rules
    int               xSampling;
    int               ySampling;
    PixelType         type;
    bool              pLinear;      // perceptually linear hint for this channel

    int               width;        // dimensions for this channel, in samples
    int               height;

    //
    // Incoming and outgoing data is scanline interleaved, and it's much
    // easier to operate on contiguous data.  Assuming the planar unc
    // buffer is to hold RLE data, we need to rearrange to make bytes
    // adjacent.
    //

    char *planarUncBuffer;
    char *planarUncBufferEnd;

    // Per-byte-plane pointers into the RLE scratch space (up to 4
    // byte planes, e.g. for FLOAT data).
    char *planarUncRle[4];
    char *planarUncRleEnd[4];

    PixelType planarUncType;  // type of the data held in planarUncBuffer
    int       planarUncSize;  // bytes per sample in planarUncBuffer
};
236
237
//
// Indices (into the channel list) of an R/G/B triple that will be
// color-space converted and compressed together as one Y'CbCr set.
//

struct DwaCompressor::CscChannelSet
{
    int idx[3];  // [0]=R, [1]=G, [2]=B channel indices
};
242
243
244 struct DwaCompressor::Classifier
245 {
ClassifierDwaCompressor::Classifier246 Classifier (std::string suffix,
247 CompressorScheme scheme,
248 PixelType type,
249 int cscIdx,
250 bool caseInsensitive):
251 _suffix(suffix),
252 _scheme(scheme),
253 _type(type),
254 _cscIdx(cscIdx),
255 _caseInsensitive(caseInsensitive)
256 {
257 if (caseInsensitive)
258 transform(_suffix.begin(), _suffix.end(), _suffix.begin(), tolower);
259 }
260
ClassifierDwaCompressor::Classifier261 Classifier (const char *&ptr, int size)
262 {
263 if (size <= 0)
264 throw Iex::InputExc("Error uncompressing DWA data"
265 " (truncated rule).");
266
267 {
268 char suffix[Name::SIZE];
269 memset (suffix, 0, Name::SIZE);
270 Xdr::read<CharPtrIO> (ptr, std::min(size, Name::SIZE-1), suffix);
271 _suffix = std::string(suffix);
272 }
273
274 if (size < _suffix.length() + 1 + 2*Xdr::size<char>())
275 throw Iex::InputExc("Error uncompressing DWA data"
276 " (truncated rule).");
277
278 char value;
279 Xdr::read<CharPtrIO> (ptr, value);
280
281 _cscIdx = (int)(value >> 4) - 1;
282 if (_cscIdx < -1 || _cscIdx >= 3)
283 throw Iex::InputExc("Error uncompressing DWA data"
284 " (corrupt cscIdx rule).");
285
286 _scheme = (CompressorScheme)((value >> 2) & 3);
287 if (_scheme < 0 || _scheme >= NUM_COMPRESSOR_SCHEMES)
288 throw Iex::InputExc("Error uncompressing DWA data"
289 " (corrupt scheme rule).");
290
291 _caseInsensitive = (value & 1 ? true : false);
292
293 Xdr::read<CharPtrIO> (ptr, value);
294 if (value < 0 || value >= NUM_PIXELTYPES)
295 throw Iex::InputExc("Error uncompressing DWA data"
296 " (corrupt rule).");
297 _type = (PixelType)value;
298 }
299
matchDwaCompressor::Classifier300 bool match (const std::string &suffix, const PixelType type) const
301 {
302 if (_type != type) return false;
303
304 if (_caseInsensitive)
305 {
306 std::string tmp(suffix);
307 transform(tmp.begin(), tmp.end(), tmp.begin(), tolower);
308 return tmp == _suffix;
309 }
310
311 return suffix == _suffix;
312 }
313
sizeDwaCompressor::Classifier314 size_t size () const
315 {
316 // string length + \0
317 size_t sizeBytes = _suffix.length() + 1;
318
319 // 1 byte for scheme / cscIdx / caseInsensitive, and 1 byte for type
320 sizeBytes += 2 * Xdr::size<char>();
321
322 return sizeBytes;
323 }
324
writeDwaCompressor::Classifier325 void write (char *&ptr) const
326 {
327 Xdr::write<CharPtrIO> (ptr, _suffix.c_str());
328
329 // Encode _cscIdx (-1-3) in the upper 4 bits,
330 // _scheme (0-2) in the next 2 bits
331 // _caseInsen in the bottom bit
332 unsigned char value = 0;
333 value |= ((unsigned char)(_cscIdx+1) & 15) << 4;
334 value |= ((unsigned char)_scheme & 3) << 2;
335 value |= (unsigned char)_caseInsensitive & 1;
336
337 Xdr::write<CharPtrIO> (ptr, value);
338 Xdr::write<CharPtrIO> (ptr, (unsigned char)_type);
339 }
340
341 std::string _suffix;
342 CompressorScheme _scheme;
343 PixelType _type;
344 int _cscIdx;
345 bool _caseInsensitive;
346 };
347
348
349 //
350 // Base class for the LOSSY_DCT decoder classes
351 //
352
class DwaCompressor::LossyDctDecoderBase
{
  public:

    //
    // packedAc / packedDc point at the encoded AC and DC coefficient
    // streams.  toLinear is an optional half->half LUT mapping decoded
    // nonlinear values back to linear; pass NULL for an identity map.
    //

    LossyDctDecoderBase
        (char *packedAc,
         char *packedDc,
         const unsigned short *toLinear,
         int width,
         int height);

    virtual ~LossyDctDecoderBase ();

    //
    // Run the decode, writing results through _rowPtrs (which the
    // subclasses are expected to have filled in).
    //

    void execute();

    //
    // These return number of items, not bytes.  Each item
    // is an unsigned short
    //

    int numAcValuesEncoded() const { return _packedAcCount; }
    int numDcValuesEncoded() const { return _packedDcCount; }

  protected:

    //
    // Un-RLE the packed AC components into
    // a half buffer. The half block should
    // be the full 8x8 block (in zig-zag order
    // still), not the first AC component.
    //
    // currAcComp is advanced as bytes are decoded.
    //
    // This returns the index of the last non-zero
    // value in the buffer - with the index into zig zag
    // order data. If we return 0, we have DC only data.
    //

    int unRleAc (unsigned short *&currAcComp,
                 unsigned short *halfZigBlock);


    //
    // if NATIVE and XDR are really the same values, we can
    // skip some processing and speed things along
    //

    bool _isNativeXdr;


    //
    // Counts of how many items have been unpacked from the
    // AC and DC buffers
    //

    int _packedAcCount;
    int _packedDcCount;


    //
    // AC and DC buffers to unpack
    //

    char *_packedAc;
    char *_packedDc;


    //
    // half -> half LUT to transform from nonlinear to linear
    //

    const unsigned short *_toLinear;


    //
    // image dimensions
    //

    int _width;
    int _height;


    //
    // Pointers to the start of each scanline, to be filled on decode.
    // Generally, these will be filled by the subclasses.
    //

    std::vector< std::vector<char *> > _rowPtrs;


    //
    // The pixel type of the data _rowPtrs[i] refers to.  Layout
    // is in the same order as _rowPtrs[].
    //

    std::vector<PixelType> _type;

    // Per-component 8x8 float scratch blocks for the inverse DCT.
    std::vector<SimdAlignedBuffer64f> _dctData;
};
451
452
453 //
454 // Used to decode a single channel of LOSSY_DCT data.
455 //
456
457 class DwaCompressor::LossyDctDecoder: public LossyDctDecoderBase
458 {
459 public:
460
461 //
462 // toLinear is a half-float LUT to convert the encoded values
463 // back to linear light. If you want to skip this step, pass
464 // in NULL here.
465 //
466
LossyDctDecoder(std::vector<char * > & rowPtrs,char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height,PixelType type)467 LossyDctDecoder
468 (std::vector<char *> &rowPtrs,
469 char *packedAc,
470 char *packedDc,
471 const unsigned short *toLinear,
472 int width,
473 int height,
474 PixelType type)
475 :
476 LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
477 {
478 _rowPtrs.push_back(rowPtrs);
479 _type.push_back(type);
480 }
481
~LossyDctDecoder()482 virtual ~LossyDctDecoder () {}
483 };
484
485
486 //
487 // Used to decode 3 channels of LOSSY_DCT data that
488 // are grouped together and color space converted.
489 //
490
491 class DwaCompressor::LossyDctDecoderCsc: public LossyDctDecoderBase
492 {
493 public:
494
495 //
496 // toLinear is a half-float LUT to convert the encoded values
497 // back to linear light. If you want to skip this step, pass
498 // in NULL here.
499 //
500
LossyDctDecoderCsc(std::vector<char * > & rowPtrsR,std::vector<char * > & rowPtrsG,std::vector<char * > & rowPtrsB,char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height,PixelType typeR,PixelType typeG,PixelType typeB)501 LossyDctDecoderCsc
502 (std::vector<char *> &rowPtrsR,
503 std::vector<char *> &rowPtrsG,
504 std::vector<char *> &rowPtrsB,
505 char *packedAc,
506 char *packedDc,
507 const unsigned short *toLinear,
508 int width,
509 int height,
510 PixelType typeR,
511 PixelType typeG,
512 PixelType typeB)
513 :
514 LossyDctDecoderBase(packedAc, packedDc, toLinear, width, height)
515 {
516 _rowPtrs.push_back(rowPtrsR);
517 _rowPtrs.push_back(rowPtrsG);
518 _rowPtrs.push_back(rowPtrsB);
519 _type.push_back(typeR);
520 _type.push_back(typeG);
521 _type.push_back(typeB);
522 }
523
~LossyDctDecoderCsc()524 virtual ~LossyDctDecoderCsc () {}
525 };
526
527
528 //
529 // Base class for encoding using the lossy DCT scheme
530 //
531
//
// Base class for encoding using the lossy DCT scheme
//

class DwaCompressor::LossyDctEncoderBase
{
  public:

    //
    // quantBaseError is the "base-error" derived from
    // dwaCompressionLevel; packedAc / packedDc receive the encoded
    // coefficient streams.  toNonlinear is an optional half->half LUT
    // applied before the DCT (linear -> perceptual); pass NULL to skip.
    //

    LossyDctEncoderBase
        (float quantBaseError,
         char *packedAc,
         char *packedDc,
         const unsigned short *toNonlinear,
         int width,
         int height);

    virtual ~LossyDctEncoderBase ();

    //
    // Run the forward DCT / quantize / pack pipeline over the
    // channels registered in _rowPtrs / _type.
    //

    void execute ();

    //
    // These return number of items, not bytes.  Each item
    // is an unsigned short
    //

    int numAcValuesEncoded () const {return _numAcComp;}
    int numDcValuesEncoded () const {return _numDcComp;}

  protected:

    // Reorder an 8x8 block into zig-zag order.
    void toZigZag (half *dst, half *src);

    // Number of set bits in src (used to pick cheap quantized values).
    int countSetBits (unsigned short src);

    // Quantized replacement for src within +- errorTolerance.
    half quantize (half src, float errorTolerance);

    // Run-length encode the AC components of a block; advances acPtr.
    void rleAc (half *block, unsigned short *&acPtr);

    float _quantBaseError;   // base quantization error (dwaCompressionLevel / 100000)

    int _width,
        _height;
    const unsigned short *_toNonlinear;  // optional linear -> nonlinear LUT

    // Counts of items (unsigned shorts) packed so far.
    int _numAcComp,
        _numDcComp;

    // Per-channel source scanline pointers, and (parallel) the pixel
    // type of each channel; scratch 8x8 float blocks for the DCT.
    std::vector< std::vector<const char *> > _rowPtrs;
    std::vector<PixelType> _type;
    std::vector<SimdAlignedBuffer64f> _dctData;


    //
    // Pointers to the buffers where AC and DC
    // DCT components should be packed for
    // lossless compression downstream
    //

    char *_packedAc;
    char *_packedDc;


    //
    // Our "quantization tables" - the example JPEG tables,
    // normalized so that the smallest value in each is 1.0.
    // This gives us a relationship between error in DCT
    // components
    //

    float _quantTableY[64];
    float _quantTableCbCr[64];
};
597
598
599
600 //
601 // Single channel lossy DCT encoder
602 //
603
604 class DwaCompressor::LossyDctEncoder: public LossyDctEncoderBase
605 {
606 public:
607
LossyDctEncoder(float quantBaseError,std::vector<const char * > & rowPtrs,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height,PixelType type)608 LossyDctEncoder
609 (float quantBaseError,
610 std::vector<const char *> &rowPtrs,
611 char *packedAc,
612 char *packedDc,
613 const unsigned short *toNonlinear,
614 int width,
615 int height,
616 PixelType type)
617 :
618 LossyDctEncoderBase
619 (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
620 {
621 _rowPtrs.push_back(rowPtrs);
622 _type.push_back(type);
623 }
624
~LossyDctEncoder()625 virtual ~LossyDctEncoder () {}
626 };
627
628
629 //
630 // RGB channel lossy DCT encoder
631 //
632
633 class DwaCompressor::LossyDctEncoderCsc: public LossyDctEncoderBase
634 {
635 public:
636
LossyDctEncoderCsc(float quantBaseError,std::vector<const char * > & rowPtrsR,std::vector<const char * > & rowPtrsG,std::vector<const char * > & rowPtrsB,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height,PixelType typeR,PixelType typeG,PixelType typeB)637 LossyDctEncoderCsc
638 (float quantBaseError,
639 std::vector<const char *> &rowPtrsR,
640 std::vector<const char *> &rowPtrsG,
641 std::vector<const char *> &rowPtrsB,
642 char *packedAc,
643 char *packedDc,
644 const unsigned short *toNonlinear,
645 int width,
646 int height,
647 PixelType typeR,
648 PixelType typeG,
649 PixelType typeB)
650 :
651 LossyDctEncoderBase
652 (quantBaseError, packedAc, packedDc, toNonlinear, width, height)
653 {
654 _type.push_back(typeR);
655 _type.push_back(typeG);
656 _type.push_back(typeB);
657
658 _rowPtrs.push_back(rowPtrsR);
659 _rowPtrs.push_back(rowPtrsG);
660 _rowPtrs.push_back(rowPtrsB);
661 }
662
~LossyDctEncoderCsc()663 virtual ~LossyDctEncoderCsc () {}
664 };
665
666
667 // ==============================================================
668 //
669 // LossyDctDecoderBase
670 //
671 // --------------------------------------------------------------
672
LossyDctDecoderBase(char * packedAc,char * packedDc,const unsigned short * toLinear,int width,int height)673 DwaCompressor::LossyDctDecoderBase::LossyDctDecoderBase
674 (char *packedAc,
675 char *packedDc,
676 const unsigned short *toLinear,
677 int width,
678 int height)
679 :
680 _isNativeXdr(false),
681 _packedAcCount(0),
682 _packedDcCount(0),
683 _packedAc(packedAc),
684 _packedDc(packedDc),
685 _toLinear(toLinear),
686 _width(width),
687 _height(height)
688 {
689 if (_toLinear == 0)
690 _toLinear = dwaCompressorNoOp;
691
692 _isNativeXdr = GLOBAL_SYSTEM_LITTLE_ENDIAN;
693 }
694
695
~LossyDctDecoderBase()696 DwaCompressor::LossyDctDecoderBase::~LossyDctDecoderBase () {}
697
698
//
// Decode the packed DC and AC coefficient streams back into pixel
// data, writing scanlines out through _rowPtrs.
//
// For each 8x8 block, per component:
//   1) splat the next DC value into an otherwise-zeroed half block
//   2) un-RLE the AC coefficients into the block (still zig-zag order)
//   3) byte-swap from XDR to native if needed
//   4) run the inverse DCT, skipping trailing all-zero rows
// Then, for the 3-channel case, undo the 709 CSC; convert float->half;
// and finally remap through the _toLinear LUT while un-blocking the
// data into scanline order.  FLOAT channels get one more pass at the
// end converting HALF XDR back to FLOAT XDR in place.
//

void
DwaCompressor::LossyDctDecoderBase::execute ()
{
    int numComp        = _rowPtrs.size();
    int lastNonZero    = 0;
    int numBlocksX     = (int) ceil ((float)_width / 8.0f);
    int numBlocksY     = (int) ceil ((float)_height / 8.0f);

    // Sample counts of the rightmost / bottom (possibly partial) blocks.
    int leftoverX      = _width - (numBlocksX-1) * 8;
    int leftoverY      = _height - (numBlocksY-1) * 8;

    int numFullBlocksX = (int)floor ((float)_width / 8.0f);

    unsigned short tmpShortNative = 0;
    unsigned short tmpShortXdr    = 0;
    const char *tmpConstCharPtr   = 0;

    unsigned short *currAcComp = (unsigned short *)_packedAc;
    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    std::vector<SimdAlignedBuffer64us> halfZigBlock (_rowPtrs.size());

    if (_type.size() != _rowPtrs.size())
        throw Iex::BaseExc ("Row pointers and types mismatch in count");

    if ((_rowPtrs.size() != 3) && (_rowPtrs.size() != 1))
        throw Iex::NoImplExc ("Only 1 and 3 channel encoding is supported");

    _dctData.resize(numComp);

    //
    // Allocate a temp aligned buffer to hold a row's worth of full
    // 8x8 half-float blocks
    //

    unsigned char *rowBlockHandle = new unsigned char
        [numComp * numBlocksX * 64 * sizeof(unsigned short) + _SSE_ALIGNMENT];

    unsigned short *rowBlock[3];

    rowBlock[0] = (unsigned short*)rowBlockHandle;

    //
    // Align rowBlock[0] to the first _SSE_ALIGNMENT boundary inside
    // the over-allocated handle.
    //

    for (int i = 0; i < _SSE_ALIGNMENT; ++i)
    {
        if (((size_t)(rowBlockHandle + i) & _SSE_ALIGNMENT_MASK) == 0)
            rowBlock[0] = (unsigned short *)(rowBlockHandle + i);
    }

    for (int comp = 1; comp < numComp; ++comp)
        rowBlock[comp] = rowBlock[comp - 1] + numBlocksX * 64;

    //
    // Pack DC components together by common plane, so we can get
    // a little more out of differencing them. We'll always have
    // one component per block, so we can compute offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int comp = 1; comp < numComp; ++comp)
        currDcComp[comp] = currDcComp[comp - 1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        int maxY = 8;

        if (blocky == numBlocksY-1)
            maxY = leftoverY;

        int maxX = 8;

        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            if (blockx == numBlocksX-1)
                maxX = leftoverX;

            //
            // If we can detect that the block is constant values
            // (all components only have DC values, and all AC is 0),
            // we can do everything only on 1 value, instead of all
            // 64.
            //
            // This won't really help for regular images, but it is
            // meant more for layers with large swaths of black
            //

            bool blockIsConstant = true;

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {

                //
                // DC component is stored separately
                //

#ifdef IMF_HAVE_SSE2
                {
                    // Zero the 8x8 half block (8 x 128-bit stores),
                    // then drop the next DC value into slot 0.
                    __m128i *dst = (__m128i*)halfZigBlock[comp]._buffer;

                    dst[7] = _mm_setzero_si128();
                    dst[6] = _mm_setzero_si128();
                    dst[5] = _mm_setzero_si128();
                    dst[4] = _mm_setzero_si128();
                    dst[3] = _mm_setzero_si128();
                    dst[2] = _mm_setzero_si128();
                    dst[1] = _mm_setzero_si128();
                    dst[0] = _mm_insert_epi16
                        (_mm_setzero_si128(), *currDcComp[comp]++, 0);
                }
#else  /* IMF_HAVE_SSE2 */

                memset (halfZigBlock[comp]._buffer, 0, 64 * 2);
                halfZigBlock[comp]._buffer[0] = *currDcComp[comp]++;

#endif /* IMF_HAVE_SSE2 */

                _packedDcCount++;

                //
                // UnRLE the AC. This will modify currAcComp
                //

                lastNonZero = unRleAc (currAcComp, halfZigBlock[comp]._buffer);

                //
                // Convert from XDR to NATIVE
                //

                if (!_isNativeXdr)
                {
                    for (int i = 0; i < 64; ++i)
                    {
                        tmpShortXdr     = halfZigBlock[comp]._buffer[i];
                        tmpConstCharPtr = (const char *)&tmpShortXdr;

                        Xdr::read<CharPtrIO> (tmpConstCharPtr, tmpShortNative);

                        halfZigBlock[comp]._buffer[i] = tmpShortNative;
                    }
                }

                if (lastNonZero == 0)
                {
                    //
                    // DC only case - AC components are all 0
                    //

                    half h;

                    h.setBits (halfZigBlock[comp]._buffer[0]);
                    _dctData[comp]._buffer[0] = (float)h;

                    dctInverse8x8DcOnly (_dctData[comp]._buffer);
                }
                else
                {
                    //
                    // We have some AC components that are non-zero.
                    // Can't use the 'constant block' optimization
                    //

                    blockIsConstant = false;

                    //
                    // Un-Zig zag
                    //

                    (*fromHalfZigZag)
                        (halfZigBlock[comp]._buffer, _dctData[comp]._buffer);

                    //
                    // Zig-Zag indices in normal layout are as follows:
                    //
                    // 0   1   3   6   10  15  21  28
                    // 2   4   7   11  16  22  29  36
                    // 5   8   12  17  23  30  37  43
                    // 9   13  18  24  31  38  44  49
                    // 14  19  25  32  39  45  50  54
                    // 20  26  33  40  46  51  55  58
                    // 27  34  41  47  52  56  59  61
                    // 35  42  48  53  57  60  62  63
                    //
                    // If lastNonZero is less than the first item on
                    // each row, we know that the whole row is zero and
                    // can be skipped in the row-oriented part of the
                    // iDCT.
                    //
                    // The unrolled logic here is:
                    //
                    //    if lastNonZero < rowStartIdx[i],
                    //        zeroedRows = rowsEmpty[i]
                    //
                    // where:
                    //
                    //    const int rowStartIdx[] = {2, 5, 9, 14, 20, 27, 35};
                    //    const int rowsEmpty[]   = {7, 6, 5, 4,  3,  2,  1};
                    //

                    if (lastNonZero < 2)
                        dctInverse8x8_7(_dctData[comp]._buffer);
                    else if (lastNonZero < 5)
                        dctInverse8x8_6(_dctData[comp]._buffer);
                    else if (lastNonZero < 9)
                        dctInverse8x8_5(_dctData[comp]._buffer);
                    else if (lastNonZero < 14)
                        dctInverse8x8_4(_dctData[comp]._buffer);
                    else if (lastNonZero < 20)
                        dctInverse8x8_3(_dctData[comp]._buffer);
                    else if (lastNonZero < 27)
                        dctInverse8x8_2(_dctData[comp]._buffer);
                    else if (lastNonZero < 35)
                        dctInverse8x8_1(_dctData[comp]._buffer);
                    else
                        dctInverse8x8_0(_dctData[comp]._buffer);
                }
            }

            //
            // Perform the CSC
            //

            if (numComp == 3)
            {
                if (!blockIsConstant)
                {
                    csc709Inverse64 (_dctData[0]._buffer,
                                     _dctData[1]._buffer,
                                     _dctData[2]._buffer);

                }
                else
                {
                    // Constant block: only element 0 is meaningful.
                    csc709Inverse (_dctData[0]._buffer[0],
                                   _dctData[1]._buffer[0],
                                   _dctData[2]._buffer[0]);
                }
            }

            //
            // Float -> Half conversion.
            //
            // If the block has a constant value, just convert the first pixel.
            //

            for (unsigned int comp = 0; comp < numComp; ++comp)
            {
                if (!blockIsConstant)
                {
                    (*convertFloatToHalf64)
                        (&rowBlock[comp][blockx*64], _dctData[comp]._buffer);
                }
                else
                {
                    // NOTE(review): '#if' here, vs '#ifdef' elsewhere in
                    // this function -- presumably relies on IMF_HAVE_SSE2
                    // being defined to a nonzero value when SSE2 is
                    // enabled; confirm against ImfSimd.h.
#if IMF_HAVE_SSE2

                    // Broadcast the single half value across the block.
                    __m128i *dst = (__m128i*)&rowBlock[comp][blockx*64];

                    dst[0] = _mm_set1_epi16
                        (((half)_dctData[comp]._buffer[0]).bits());

                    dst[1] = dst[0];
                    dst[2] = dst[0];
                    dst[3] = dst[0];
                    dst[4] = dst[0];
                    dst[5] = dst[0];
                    dst[6] = dst[0];
                    dst[7] = dst[0];

#else  /* IMF_HAVE_SSE2 */

                    unsigned short *dst = &rowBlock[comp][blockx*64];

                    dst[0] = ((half)_dctData[comp]._buffer[0]).bits();

                    for (int i = 1; i < 64; ++i)
                    {
                        dst[i] = dst[0];
                    }

#endif /* IMF_HAVE_SSE2 */
                } // blockIsConstant
            } // comp
        } // blockx

        //
        // At this point, we have half-float nonlinear value blocked
        // in rowBlock[][]. We need to unblock the data, transfer
        // back to linear, and write the results in the _rowPtrs[].
        //
        // There is a fast-path for aligned rows, which helps
        // things a little. Since this fast path is only valid
        // for full 8-element wide blocks, the partial x blocks
        // are broken into a separate loop below.
        //
        // At the moment, the fast path requires:
        //   * sse support
        //   * aligned row pointers
        //   * full 8-element wide blocks
        //

        for (int comp = 0; comp < numComp; ++comp)
        {
            //
            // Test if we can use the fast path
            //

#ifdef IMF_HAVE_SSE2

            bool fastPath = true;

            for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
            {
                if ((size_t)_rowPtrs[comp][y] & _SSE_ALIGNMENT_MASK)
                    fastPath = false;
            }

            if (fastPath)
            {
                //
                // Handle all the full X blocks, in a fast path with sse2 and
                // aligned row pointers
                //

                for (int y=8*blocky; y<8*blocky+maxY; ++y)
                {
                    __m128i *dst = (__m128i *)_rowPtrs[comp][y];
                    __m128i *src = (__m128i *)&rowBlock[comp][(y & 0x7) * 8];


                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        //
                        // These may need some twiddling.
                        // Run with multiples of 8
                        //

                        _mm_prefetch ((char *)(src + 16), _MM_HINT_NTA);

                        unsigned short i0 = _mm_extract_epi16 (*src, 0);
                        unsigned short i1 = _mm_extract_epi16 (*src, 1);
                        unsigned short i2 = _mm_extract_epi16 (*src, 2);
                        unsigned short i3 = _mm_extract_epi16 (*src, 3);

                        unsigned short i4 = _mm_extract_epi16 (*src, 4);
                        unsigned short i5 = _mm_extract_epi16 (*src, 5);
                        unsigned short i6 = _mm_extract_epi16 (*src, 6);
                        unsigned short i7 = _mm_extract_epi16 (*src, 7);

                        // Nonlinear -> linear through the LUT.
                        i0 = _toLinear[i0];
                        i1 = _toLinear[i1];
                        i2 = _toLinear[i2];
                        i3 = _toLinear[i3];

                        i4 = _toLinear[i4];
                        i5 = _toLinear[i5];
                        i6 = _toLinear[i6];
                        i7 = _toLinear[i7];

                        *dst = _mm_insert_epi16 (_mm_setzero_si128(), i0, 0);
                        *dst = _mm_insert_epi16 (*dst, i1, 1);
                        *dst = _mm_insert_epi16 (*dst, i2, 2);
                        *dst = _mm_insert_epi16 (*dst, i3, 3);

                        *dst = _mm_insert_epi16 (*dst, i4, 4);
                        *dst = _mm_insert_epi16 (*dst, i5, 5);
                        *dst = _mm_insert_epi16 (*dst, i6, 6);
                        *dst = _mm_insert_epi16 (*dst, i7, 7);

                        // Next block's row lives 64 shorts (8 vectors)
                        // further along in rowBlock.
                        src += 8;
                        dst++;
                    }
                }
            }
            else
            {

#endif /* IMF_HAVE_SSE2 */

                //
                // Basic scalar kinda slow path for handling the full X blocks
                //

                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    for (int blockx = 0; blockx < numFullBlocksX; ++blockx)
                    {
                        unsigned short *src =
                            &rowBlock[comp][blockx * 64 + ((y & 0x7) * 8)];

                        dst[0] = _toLinear[src[0]];
                        dst[1] = _toLinear[src[1]];
                        dst[2] = _toLinear[src[2]];
                        dst[3] = _toLinear[src[3]];

                        dst[4] = _toLinear[src[4]];
                        dst[5] = _toLinear[src[5]];
                        dst[6] = _toLinear[src[6]];
                        dst[7] = _toLinear[src[7]];

                        dst += 8;
                    }
                }

#ifdef IMF_HAVE_SSE2

            }

#endif /* IMF_HAVE_SSE2 */

            //
            // If we have partial X blocks, deal with all those now
            // Since this should be minimal work, there currently
            // is only one path that should work for everyone.
            //

            if (numFullBlocksX != numBlocksX)
            {
                for (int y = 8 * blocky; y < 8 * blocky + maxY; ++y)
                {
                    unsigned short *src = (unsigned short *)
                        &rowBlock[comp][numFullBlocksX * 64 + ((y & 0x7) * 8)];

                    unsigned short *dst = (unsigned short *)_rowPtrs[comp][y];

                    dst += 8 * numFullBlocksX;

                    for (int x = 0; x < maxX; ++x)
                    {
                        *dst++ = _toLinear[*src++];
                    }
                }
            }
        } // comp
    } // blocky

    //
    // Walk over all the channels that are of type FLOAT.
    // Convert from HALF XDR back to FLOAT XDR.
    //

    for (unsigned int chan = 0; chan < numComp; ++chan)
    {

        if (_type[chan] != FLOAT)
            continue;

        std::vector<unsigned short> halfXdr (_width);

        for (int y=0; y<_height; ++y)
        {
            char *floatXdrPtr = _rowPtrs[chan][y];

            // Copy the HALF data out first: the FLOAT results written
            // below are wider and would overwrite unread source data.
            memcpy(&halfXdr[0], floatXdrPtr, _width*sizeof(unsigned short));

            const char *halfXdrPtr = (const char *)(&halfXdr[0]);

            for (int x=0; x<_width; ++x)
            {
                half tmpHalf;

                Xdr::read<CharPtrIO> (halfXdrPtr, tmpHalf);
                Xdr::write<CharPtrIO> (floatXdrPtr, (float)tmpHalf);

                //
                // Xdr::write and Xdr::read will advance the ptrs
                //
            }
        }
    }

    delete[] rowBlockHandle;
}
1171
1172
1173 //
1174 // Un-RLE the packed AC components into
1175 // a half buffer. The half block should
1176 // be the full 8x8 block (in zig-zag order
1177 // still), not the first AC component.
1178 //
1179 // currAcComp is advanced as bytes are decoded.
1180 //
1181 // This returns the index of the last non-zero
1182 // value in the buffer - with the index into zig zag
1183 // order data. If we return 0, we have DC only data.
1184 //
// This assumes that halfZigBlock has been zeroed
// prior to calling.
1187 //
1188
1189 int
unRleAc(unsigned short * & currAcComp,unsigned short * halfZigBlock)1190 DwaCompressor::LossyDctDecoderBase::unRleAc
1191 (unsigned short *&currAcComp,
1192 unsigned short *halfZigBlock)
1193 {
1194 //
1195 // Un-RLE the RLE'd blocks. If we find an item whose
1196 // high byte is 0xff, then insert the number of 0's
1197 // as indicated by the low byte.
1198 //
1199 // Otherwise, just copy the number verbaitm.
1200 //
1201
1202 int lastNonZero = 0;
1203 int dctComp = 1;
1204
1205 //
1206 // Start with a zero'ed block, so we don't have to
1207 // write when we hit a run symbol
1208 //
1209
1210 while (dctComp < 64)
1211 {
1212 if (*currAcComp == 0xff00)
1213 {
1214 //
1215 // End of block
1216 //
1217
1218 dctComp = 64;
1219
1220 }
1221 else if ((*currAcComp) >> 8 == 0xff)
1222 {
1223 //
1224 // Run detected! Insert 0's.
1225 //
1226 // Since the block has been zeroed, just advance the ptr
1227 //
1228
1229 dctComp += (*currAcComp) & 0xff;
1230 }
1231 else
1232 {
1233 //
1234 // Not a run, just copy over the value
1235 //
1236
1237 lastNonZero = dctComp;
1238 halfZigBlock[dctComp] = *currAcComp;
1239
1240 dctComp++;
1241 }
1242
1243 _packedAcCount++;
1244 currAcComp++;
1245 }
1246
1247 return lastNonZero;
1248 }
1249
1250
1251 // ==============================================================
1252 //
1253 // LossyDctEncoderBase
1254 //
1255 // --------------------------------------------------------------
1256
LossyDctEncoderBase(float quantBaseError,char * packedAc,char * packedDc,const unsigned short * toNonlinear,int width,int height)1257 DwaCompressor::LossyDctEncoderBase::LossyDctEncoderBase
1258 (float quantBaseError,
1259 char *packedAc,
1260 char *packedDc,
1261 const unsigned short *toNonlinear,
1262 int width,
1263 int height)
1264 :
1265 _quantBaseError(quantBaseError),
1266 _width(width),
1267 _height(height),
1268 _toNonlinear(toNonlinear),
1269 _numAcComp(0),
1270 _numDcComp(0),
1271 _packedAc(packedAc),
1272 _packedDc(packedDc)
1273 {
1274 //
1275 // Here, we take the generic JPEG quantization tables and
1276 // normalize them by the smallest component in each table.
1277 // This gives us a relationship amongst the DCT components,
1278 // in terms of how sensitive each component is to
1279 // error.
1280 //
1281 // A higher normalized value means we can quantize more,
1282 // and a small normalized value means we can quantize less.
1283 //
1284 // Eventually, we will want an acceptable quantization
1285 // error range for each component. We find this by
1286 // multiplying some user-specified level (_quantBaseError)
1287 // by the normalized table (_quantTableY, _quantTableCbCr) to
1288 // find the acceptable quantization error range.
1289 //
1290 // The quantization table is not needed for decoding, and
1291 // is not transmitted. So, if you want to get really fancy,
1292 // you could derive some content-dependent quantization
1293 // table, and the decoder would not need to be changed. But,
1294 // for now, we'll just use statice quantization tables.
1295 //
1296
1297 int jpegQuantTableY[] =
1298 {
1299 16, 11, 10, 16, 24, 40, 51, 61,
1300 12, 12, 14, 19, 26, 58, 60, 55,
1301 14, 13, 16, 24, 40, 57, 69, 56,
1302 14, 17, 22, 29, 51, 87, 80, 62,
1303 18, 22, 37, 56, 68, 109, 103, 77,
1304 24, 35, 55, 64, 81, 104, 113, 92,
1305 49, 64, 78, 87, 103, 121, 120, 101,
1306 72, 92, 95, 98, 112, 100, 103, 99
1307 };
1308
1309 int jpegQuantTableYMin = 10;
1310
1311 int jpegQuantTableCbCr[] =
1312 {
1313 17, 18, 24, 47, 99, 99, 99, 99,
1314 18, 21, 26, 66, 99, 99, 99, 99,
1315 24, 26, 56, 99, 99, 99, 99, 99,
1316 47, 66, 99, 99, 99, 99, 99, 99,
1317 99, 99, 99, 99, 99, 99, 99, 99,
1318 99, 99, 99, 99, 99, 99, 99, 99,
1319 99, 99, 99, 99, 99, 99, 99, 99,
1320 99, 99, 99, 99, 99, 99, 99, 99
1321 };
1322
1323 int jpegQuantTableCbCrMin = 17;
1324
1325 for (int idx = 0; idx < 64; ++idx)
1326 {
1327 _quantTableY[idx] = static_cast<float> (jpegQuantTableY[idx]) /
1328 static_cast<float> (jpegQuantTableYMin);
1329
1330 _quantTableCbCr[idx] = static_cast<float> (jpegQuantTableCbCr[idx]) /
1331 static_cast<float> (jpegQuantTableCbCrMin);
1332 }
1333
1334 if (_quantBaseError < 0)
1335 quantBaseError = 0;
1336 }
1337
1338
DwaCompressor::LossyDctEncoderBase::~LossyDctEncoderBase ()
{
    // Nothing to release: the AC/DC buffers and the nonlinear lookup
    // table are caller-owned pointers stored by the constructor.
}
1342
1343
1344 //
// Given three channels of source data, encode by first applying
// a color space conversion to YCbCr space. Otherwise, if we only
// have one channel, just encode it as is.
1348 //
1349 // Other numbers of channels are somewhat unexpected at this point,
1350 // and will throw an exception.
1351 //
1352
void
DwaCompressor::LossyDctEncoderBase::execute ()
{
    //
    // Encode the channel set: for every 8x8 pixel block, optionally
    // color-convert (3-channel case), forward-DCT, quantize, zig-zag,
    // then append the DC coefficient to the packed DC stream and the
    // RLE'd AC coefficients to the packed AC stream.
    //

    // Round image dimensions up to whole 8x8 blocks.
    int numBlocksX = (int)ceil ((float)_width / 8.0f);
    int numBlocksY = (int)ceil ((float)_height/ 8.0f);

    half halfZigCoef[64];    // one block of coefficients, zig-zag order
    half halfCoef[64];       // one block of coefficients, natural order

    std::vector<unsigned short *> currDcComp (_rowPtrs.size());
    unsigned short *currAcComp = (unsigned short *)_packedAc;

    _dctData.resize (_rowPtrs.size());
    _numAcComp = 0;
    _numDcComp = 0;

    // Either one stand-alone channel, or a 3-channel CSC set.
    assert (_type.size() == _rowPtrs.size());
    assert ((_rowPtrs.size() == 3) || (_rowPtrs.size() == 1));

    //
    // Allocate a temp half buffer to quantize into for
    // any FLOAT source channels.
    //

    int tmpHalfBufferElements = 0;

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
        if (_type[chan] == FLOAT)
            tmpHalfBufferElements += _width * _height;

    std::vector<unsigned short> tmpHalfBuffer (tmpHalfBufferElements);

    char *tmpHalfBufferPtr = 0;

    if (tmpHalfBufferElements)
        tmpHalfBufferPtr = (char *)&tmpHalfBuffer[0];

    //
    // Run over all the float scanlines, quantizing,
    // and re-assigning _rowPtr[y]. We need to translate
    // FLOAT XDR to HALF XDR.
    //

    for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
    {
        if (_type[chan] != FLOAT)
            continue;

        for (int y = 0; y < _height; ++y)
        {
            float src = 0;
            const char *srcXdr = _rowPtrs[chan][y];
            char *dstXdr = tmpHalfBufferPtr;

            for (int x = 0; x < _width; ++x)
            {

                Xdr::read<CharPtrIO> (srcXdr, src);
                Xdr::write<CharPtrIO> (dstXdr, ((half)src).bits());

                //
                // Xdr::read and Xdr::write will advance the ptr
                //
            }

            // From here on this row points at the converted HALF data.
            _rowPtrs[chan][y] = (const char *)tmpHalfBufferPtr;
            tmpHalfBufferPtr += _width * sizeof (unsigned short);
        }
    }

    //
    // Pack DC components together by common plane, so we can get
    // a little more out of differencing them. We'll always have
    // one component per block, so we can computed offsets.
    //

    currDcComp[0] = (unsigned short *)_packedDc;

    for (unsigned int chan = 1; chan < _rowPtrs.size(); ++chan)
        currDcComp[chan] = currDcComp[chan-1] + numBlocksX * numBlocksY;

    for (int blocky = 0; blocky < numBlocksY; ++blocky)
    {
        for (int blockx = 0; blockx < numBlocksX; ++blockx)
        {
            half h;
            unsigned short tmpShortXdr, tmpShortNative;
            char *tmpCharPtr;

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Break the source into 8x8 blocks. If we don't
                // fit at the edges, mirror.
                //
                // Also, convert from linear to nonlinear representation.
                // Our source is assumed to be XDR, and we need to convert
                // to NATIVE prior to converting to float.
                //
                // If we're converting linear -> nonlinear, assume that the
                // XDR -> NATIVE conversion is built into the lookup. Otherwise,
                // we'll need to explicitly do it.
                //

                for (int y = 0; y < 8; ++y)
                {
                    for (int x = 0; x < 8; ++x)
                    {
                        int vx = 8 * blockx + x;
                        int vy = 8 * blocky + y;

                        // Reflect coordinates that fall off the edge
                        // of the image back inside (edge mirroring).

                        if (vx >= _width)
                            vx = _width - (vx - (_width - 1));

                        if (vx < 0) vx = _width-1;

                        if (vy >=_height)
                            vy = _height - (vy - (_height - 1));

                        if (vy < 0) vy = _height-1;

                        tmpShortXdr =
                            ((const unsigned short *)(_rowPtrs[chan])[vy])[vx];

                        if (_toNonlinear)
                        {
                            // Lookup table folds in the XDR -> NATIVE swap.
                            h.setBits (_toNonlinear[tmpShortXdr]);
                        }
                        else
                        {
                            const char *tmpConstCharPtr =
                                (const char *)(&tmpShortXdr);

                            Xdr::read<CharPtrIO>
                                (tmpConstCharPtr, tmpShortNative);

                            h.setBits(tmpShortNative);
                        }

                        _dctData[chan]._buffer[y * 8 + x] = (float)h;
                    } // x
                } // y
            } // chan

            //
            // Color space conversion
            //

            if (_rowPtrs.size() == 3)
            {
                csc709Forward64 (_dctData[0]._buffer,
                                 _dctData[1]._buffer,
                                 _dctData[2]._buffer);
            }

            for (unsigned int chan = 0; chan < _rowPtrs.size(); ++chan)
            {
                //
                // Forward DCT
                //

                dctForward8x8(_dctData[chan]._buffer);

                //
                // Quantize to half, and zigzag
                //

                if (chan == 0)
                {
                    // Channel 0 (luminance, or the lone channel) uses
                    // the Y quantization table.
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableY[i]);
                    }
                }
                else
                {
                    // Chroma channels use the CbCr table.
                    for (int i = 0; i < 64; ++i)
                    {
                        halfCoef[i] =
                            quantize ((half)_dctData[chan]._buffer[i],
                                      _quantBaseError*_quantTableCbCr[i]);
                    }
                }

                toZigZag (halfZigCoef, halfCoef);

                //
                // Convert from NATIVE back to XDR, before we write out
                //

                for (int i = 0; i < 64; ++i)
                {
                    tmpCharPtr = (char *)&tmpShortXdr;
                    Xdr::write<CharPtrIO>(tmpCharPtr, halfZigCoef[i].bits());
                    halfZigCoef[i].setBits(tmpShortXdr);
                }

                //
                // Save the DC component separately, to be compressed on
                // its own.
                //

                *currDcComp[chan]++ = halfZigCoef[0].bits();
                _numDcComp++;

                //
                // Then RLE the AC components (which will record the count
                // of the resulting number of items)
                //

                rleAc (halfZigCoef, currAcComp);
            } // chan
        } // blockx
    } // blocky
}
1570
1571
1572 //
1573 // Reorder from zig-zag order to normal ordering
1574 //
1575
1576 void
toZigZag(half * dst,half * src)1577 DwaCompressor::LossyDctEncoderBase::toZigZag (half *dst, half *src)
1578 {
1579 const int remap[] =
1580 {
1581 0,
1582 1, 8,
1583 16, 9, 2,
1584 3, 10, 17, 24,
1585 32, 25, 18, 11, 4,
1586 5, 12, 19, 26, 33, 40,
1587 48, 41, 34, 27, 20, 13, 6,
1588 7, 14, 21, 28, 35, 42, 49, 56,
1589 57, 50, 43, 36, 29, 22, 15,
1590 23, 30, 37, 44, 51, 58,
1591 59, 52, 45, 38, 31,
1592 39, 46, 53, 60,
1593 61, 54, 47,
1594 55, 62,
1595 63
1596 };
1597
1598 for (int i=0; i<64; ++i)
1599 dst[i] = src[remap[i]];
1600 }
1601
1602
1603 //
1604 // Precomputing the bit count runs faster than using
1605 // the builtin instruction, at least in one case..
1606 //
1607 // Precomputing 8-bits is no slower than 16-bits,
1608 // and saves a fair bit of overhead..
1609 //
1610
int
DwaCompressor::LossyDctEncoderBase::countSetBits (unsigned short src)
{
    //
    // Population count of a 16-bit value, done as two lookups in a
    // precomputed 8-bit table (see the note above: this measured
    // faster than the builtin, and an 8-bit table is no slower
    // than a 16-bit one).
    //

    static const unsigned short numBitsSet[256] =
    {
        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
        4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    };

    // Sum the counts for the low and high bytes.
    return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
}
1636
1637
1638 //
1639 // Take a DCT coefficient, as well as an acceptable error. Search
1640 // nearby values within the error tolerance, that have fewer
1641 // bits set.
1642 //
1643 // The list of candidates has been pre-computed and sorted
1644 // in order of increasing numbers of bits set. This way, we
1645 // can stop searching as soon as we find a candidate that
1646 // is within the error tolerance.
1647 //
1648
1649 half
quantize(half src,float errorTolerance)1650 DwaCompressor::LossyDctEncoderBase::quantize (half src, float errorTolerance)
1651 {
1652 half tmp;
1653 float srcFloat = (float)src;
1654 int numSetBits = countSetBits(src.bits());
1655 const unsigned short *closest = closestData + closestDataOffset[src.bits()];
1656
1657 for (int targetNumSetBits = numSetBits - 1;
1658 targetNumSetBits >= 0;
1659 --targetNumSetBits)
1660 {
1661 tmp.setBits (*closest);
1662
1663 if (fabs ((float)tmp - srcFloat) < errorTolerance)
1664 return tmp;
1665
1666 closest++;
1667 }
1668
1669 return src;
1670 }
1671
1672
1673 //
1674 // RLE the zig-zag of the AC components + copy over
1675 // into another tmp buffer
1676 //
1677 // Try to do a simple RLE scheme to reduce run's of 0's. This
1678 // differs from the jpeg EOB case, since EOB just indicates that
1679 // the rest of the block is zero. In our case, we have lots of
// NaN symbols, which shouldn't be allowed to occur in DCT
// coefficients - so we'll use them for encoding runs.
1682 //
1683 // If the high byte is 0xff, then we have a run of 0's, of length
1684 // given by the low byte. For example, 0xff03 would be a run
1685 // of 3 0's, starting at the current location.
1686 //
// block is our block of 64 coefficients
// acPtr a pointer to pack the RLE'd values into.
1689 //
1690 // This will advance the counter, _numAcComp.
1691 //
1692
1693 void
rleAc(half * block,unsigned short * & acPtr)1694 DwaCompressor::LossyDctEncoderBase::rleAc
1695 (half *block,
1696 unsigned short *&acPtr)
1697 {
1698 int dctComp = 1;
1699 unsigned short rleSymbol = 0x0;
1700
1701 while (dctComp < 64)
1702 {
1703 int runLen = 1;
1704
1705 //
1706 // If we don't have a 0, output verbatim
1707 //
1708
1709 if (block[dctComp].bits() != rleSymbol)
1710 {
1711 *acPtr++ = block[dctComp].bits();
1712 _numAcComp++;
1713
1714 dctComp += runLen;
1715 continue;
1716 }
1717
1718 //
1719 // We're sitting on a 0, so see how big the run is.
1720 //
1721
1722 while ((dctComp+runLen < 64) &&
1723 (block[dctComp+runLen].bits() == rleSymbol))
1724 {
1725 runLen++;
1726 }
1727
1728 //
1729 // If the run len is too small, just output verbatim
1730 // otherwise output our run token
1731 //
1732 // Originally, we wouldn't have a separate symbol for
1733 // "end of block". But in some experimentation, it looks
1734 // like using 0xff00 for "end of block" can save a bit
1735 // of space.
1736 //
1737
1738 if (runLen == 1)
1739 {
1740 runLen = 1;
1741 *acPtr++ = block[dctComp].bits();
1742 _numAcComp++;
1743
1744 //
1745 // Using 0xff00 for "end of block"
1746 //
1747 }
1748 else if (runLen + dctComp == 64)
1749 {
1750 //
1751 // Signal EOB
1752 //
1753
1754 *acPtr++ = 0xff00;
1755 _numAcComp++;
1756 }
1757 else
1758 {
1759 //
1760 // Signal normal run
1761 //
1762
1763 *acPtr++ = 0xff00 | runLen;
1764 _numAcComp++;
1765 }
1766
1767 //
1768 // Advance by runLen
1769 //
1770
1771 dctComp += runLen;
1772 }
1773 }
1774
1775
1776 // ==============================================================
1777 //
1778 // DwaCompressor
1779 //
1780 // --------------------------------------------------------------
1781
1782 //
1783 // DwaCompressor()
1784 //
1785
DwaCompressor::DwaCompressor
    (const Header &hdr,
     int maxScanLineSize,
     int numScanLines,
     AcCompression acCompression)
:
    Compressor(hdr),
    _acCompression(acCompression),
    _maxScanLineSize(maxScanLineSize),
    _numScanLines(numScanLines),
    _channels(hdr.channels()),
    _packedAcBuffer(0),
    _packedAcBufferSize(0),
    _packedDcBuffer(0),
    _packedDcBufferSize(0),
    _rleBuffer(0),
    _rleBufferSize(0),
    _outBuffer(0),
    _outBufferSize(0),
    _zip(0),
    _dwaCompressionLevel(45.0)  // default quality; overridden below when
                                // the header carries its own level
{
    //
    // Cache the data window bounds; used later to clamp compression
    // ranges to the image.
    //

    _min[0] = hdr.dataWindow().min.x;
    _min[1] = hdr.dataWindow().min.y;
    _max[0] = hdr.dataWindow().max.x;
    _max[1] = hdr.dataWindow().max.y;

    //
    // Per-scheme planar buffers start unallocated; they are grown on
    // demand during (un)compression.
    //

    for (int i=0; i < NUM_COMPRESSOR_SCHEMES; ++i)
    {
        _planarUncBuffer[i] = 0;
        _planarUncBufferSize[i] = 0;
    }

    //
    // Check the header for a quality attribute
    //

    if (hasDwaCompressionLevel (hdr))
        _dwaCompressionLevel = dwaCompressionLevel (hdr);
}
1826
1827
DwaCompressor::~DwaCompressor()
{
    //
    // Free every buffer this compressor may have grown on demand.
    // delete / delete[] of a null pointer is a no-op, so buffers
    // that were never allocated are safe to pass through here.
    //

    delete[] _packedAcBuffer;
    delete[] _packedDcBuffer;
    delete[] _rleBuffer;
    delete[] _outBuffer;
    delete _zip;

    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
        delete[] _planarUncBuffer[i];
}
1839
1840
int
DwaCompressor::numScanLines() const
{
    // Number of scanlines handled per compression chunk, as supplied
    // at construction time.
    return _numScanLines;
}
1846
1847
1848 Imf::Compressor::Format
format() const1849 DwaCompressor::format() const
1850 {
1851 if (GLOBAL_SYSTEM_LITTLE_ENDIAN)
1852 return NATIVE;
1853 else
1854 return XDR;
1855 }
1856
1857
int
DwaCompressor::compress
    (const char *inPtr,
     int inSize,
     int minY,
     const char *&outPtr)
{
    //
    // Scanline entry point: build a box spanning this chunk's
    // scanlines across the full data-window width, then forward to
    // the Box2i overload, which does the real work.
    //

    return compress
        (inPtr,
         inSize,
         Imath::Box2i (Imath::V2i (_min[0], minY),
                       Imath::V2i (_max[0], minY + numScanLines() - 1)),
         outPtr);
}
1872
1873
int
DwaCompressor::compressTile
    (const char *inPtr,
     int inSize,
     Imath::Box2i range,
     const char *&outPtr)
{
    //
    // Tile entry point: tiles already come with their pixel range,
    // so just forward to the Box2i compress overload.
    //

    return compress (inPtr, inSize, range, outPtr);
}
1883
1884
1885 int
compress(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)1886 DwaCompressor::compress
1887 (const char *inPtr,
1888 int inSize,
1889 Imath::Box2i range,
1890 const char *&outPtr)
1891 {
1892 const char *inDataPtr = inPtr;
1893 char *packedAcEnd = 0;
1894 char *packedDcEnd = 0;
1895 int fileVersion = 2; // Starting with 2, we write the channel
1896 // classification rules into the file
1897
1898 if (fileVersion < 2)
1899 initializeLegacyChannelRules();
1900 else
1901 initializeDefaultChannelRules();
1902
1903 size_t outBufferSize = 0;
1904 initializeBuffers(outBufferSize);
1905
1906 unsigned short channelRuleSize = 0;
1907 std::vector<Classifier> channelRules;
1908 if (fileVersion >= 2)
1909 {
1910 relevantChannelRules(channelRules);
1911
1912 channelRuleSize = Xdr::size<unsigned short>();
1913 for (size_t i = 0; i < channelRules.size(); ++i)
1914 channelRuleSize += channelRules[i].size();
1915 }
1916
1917 //
1918 // Remember to allocate _outBuffer, if we haven't done so already.
1919 //
1920
1921 outBufferSize += channelRuleSize;
1922 if (outBufferSize > _outBufferSize)
1923 {
1924 _outBufferSize = outBufferSize;
1925 if (_outBuffer == 0)
1926 delete[] _outBuffer;
1927 _outBuffer = new char[outBufferSize];
1928 }
1929
1930 char *outDataPtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64) +
1931 channelRuleSize];
1932
1933 //
1934 // We might not be dealing with any color data, in which
1935 // case the AC buffer size will be 0, and deferencing
1936 // a vector will not be a good thing to do.
1937 //
1938
1939 if (_packedAcBuffer)
1940 packedAcEnd = _packedAcBuffer;
1941
1942 if (_packedDcBuffer)
1943 packedDcEnd = _packedDcBuffer;
1944
1945 #define OBIDX(x) (Int64 *)&_outBuffer[x * sizeof (Int64)]
1946
1947 Int64 *version = OBIDX (VERSION);
1948 Int64 *unknownUncompressedSize = OBIDX (UNKNOWN_UNCOMPRESSED_SIZE);
1949 Int64 *unknownCompressedSize = OBIDX (UNKNOWN_COMPRESSED_SIZE);
1950 Int64 *acCompressedSize = OBIDX (AC_COMPRESSED_SIZE);
1951 Int64 *dcCompressedSize = OBIDX (DC_COMPRESSED_SIZE);
1952 Int64 *rleCompressedSize = OBIDX (RLE_COMPRESSED_SIZE);
1953 Int64 *rleUncompressedSize = OBIDX (RLE_UNCOMPRESSED_SIZE);
1954 Int64 *rleRawSize = OBIDX (RLE_RAW_SIZE);
1955
1956 Int64 *totalAcUncompressedCount = OBIDX (AC_UNCOMPRESSED_COUNT);
1957 Int64 *totalDcUncompressedCount = OBIDX (DC_UNCOMPRESSED_COUNT);
1958
1959 Int64 *acCompression = OBIDX (AC_COMPRESSION);
1960
1961 int minX = range.min.x;
1962 int maxX = std::min(range.max.x, _max[0]);
1963 int minY = range.min.y;
1964 int maxY = std::min(range.max.y, _max[1]);
1965
1966 //
1967 // Zero all the numbers in the chunk header
1968 //
1969
1970 memset (_outBuffer, 0, NUM_SIZES_SINGLE * sizeof (Int64));
1971
1972 //
1973 // Setup the AC compression strategy and the version in the data block,
1974 // then write the relevant channel classification rules if needed
1975 //
1976 *version = fileVersion;
1977 *acCompression = _acCompression;
1978
1979 setupChannelData (minX, minY, maxX, maxY);
1980
1981 if (fileVersion >= 2)
1982 {
1983 char *writePtr = &_outBuffer[NUM_SIZES_SINGLE * sizeof(Imf::Int64)];
1984 Xdr::write<CharPtrIO> (writePtr, channelRuleSize);
1985
1986 for (size_t i = 0; i < channelRules.size(); ++i)
1987 channelRules[i].write(writePtr);
1988 }
1989
1990 //
1991 // Determine the start of each row in the input buffer
1992 // Channels are interleaved by scanline
1993 //
1994
1995 std::vector<bool> encodedChannels (_channelData.size());
1996 std::vector< std::vector<const char *> > rowPtrs (_channelData.size());
1997
1998 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
1999 encodedChannels[chan] = false;
2000
2001 inDataPtr = inPtr;
2002
2003 for (int y = minY; y <= maxY; ++y)
2004 {
2005 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2006 {
2007
2008 ChannelData *cd = &_channelData[chan];
2009
2010 if (Imath::modp(y, cd->ySampling) != 0)
2011 continue;
2012
2013 rowPtrs[chan].push_back(inDataPtr);
2014 inDataPtr += cd->width * Imf::pixelTypeSize(cd->type);
2015 }
2016 }
2017
2018 inDataPtr = inPtr;
2019
2020 //
2021 // Make a pass over all our CSC sets and try to encode them first
2022 //
2023
2024 for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
2025 {
2026
2027 LossyDctEncoderCsc encoder
2028 (_dwaCompressionLevel / 100000.f,
2029 rowPtrs[_cscSets[csc].idx[0]],
2030 rowPtrs[_cscSets[csc].idx[1]],
2031 rowPtrs[_cscSets[csc].idx[2]],
2032 packedAcEnd,
2033 packedDcEnd,
2034 dwaCompressorToNonlinear,
2035 _channelData[_cscSets[csc].idx[0]].width,
2036 _channelData[_cscSets[csc].idx[0]].height,
2037 _channelData[_cscSets[csc].idx[0]].type,
2038 _channelData[_cscSets[csc].idx[1]].type,
2039 _channelData[_cscSets[csc].idx[2]].type);
2040
2041 encoder.execute();
2042
2043 *totalAcUncompressedCount += encoder.numAcValuesEncoded();
2044 *totalDcUncompressedCount += encoder.numDcValuesEncoded();
2045
2046 packedAcEnd += encoder.numAcValuesEncoded() * sizeof(unsigned short);
2047 packedDcEnd += encoder.numDcValuesEncoded() * sizeof(unsigned short);
2048
2049 encodedChannels[_cscSets[csc].idx[0]] = true;
2050 encodedChannels[_cscSets[csc].idx[1]] = true;
2051 encodedChannels[_cscSets[csc].idx[2]] = true;
2052 }
2053
2054 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2055 {
2056 ChannelData *cd = &_channelData[chan];
2057
2058 if (encodedChannels[chan])
2059 continue;
2060
2061 switch (cd->compression)
2062 {
2063 case LOSSY_DCT:
2064
2065 //
2066 // For LOSSY_DCT, treat this just like the CSC'd case,
2067 // but only operate on one channel
2068 //
2069
2070 {
2071 const unsigned short *nonlinearLut = 0;
2072
2073 if (!cd->pLinear)
2074 nonlinearLut = dwaCompressorToNonlinear;
2075
2076 LossyDctEncoder encoder
2077 (_dwaCompressionLevel / 100000.f,
2078 rowPtrs[chan],
2079 packedAcEnd,
2080 packedDcEnd,
2081 nonlinearLut,
2082 cd->width,
2083 cd->height,
2084 cd->type);
2085
2086 encoder.execute();
2087
2088 *totalAcUncompressedCount += encoder.numAcValuesEncoded();
2089 *totalDcUncompressedCount += encoder.numDcValuesEncoded();
2090
2091 packedAcEnd +=
2092 encoder.numAcValuesEncoded() * sizeof (unsigned short);
2093
2094 packedDcEnd +=
2095 encoder.numDcValuesEncoded() * sizeof (unsigned short);
2096 }
2097
2098 break;
2099
2100 case RLE:
2101
2102 //
2103 // For RLE, bash the bytes up so that the first bytes of each
2104 // pixel are contingous, as are the second bytes, and so on.
2105 //
2106
2107 for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
2108 {
2109 const char *row = rowPtrs[chan][y];
2110
2111 for (int x = 0; x < cd->width; ++x)
2112 {
2113 for (int byte = 0;
2114 byte < Imf::pixelTypeSize (cd->type);
2115 ++byte)
2116 {
2117
2118 *cd->planarUncRleEnd[byte]++ = *row++;
2119 }
2120 }
2121
2122 *rleRawSize += cd->width * Imf::pixelTypeSize(cd->type);
2123 }
2124
2125 break;
2126
2127 case UNKNOWN:
2128
2129 //
2130 // Otherwise, just copy data over verbatim
2131 //
2132
2133 {
2134 int scanlineSize = cd->width * Imf::pixelTypeSize(cd->type);
2135
2136 for (unsigned int y = 0; y < rowPtrs[chan].size(); ++y)
2137 {
2138 memcpy (cd->planarUncBufferEnd,
2139 rowPtrs[chan][y],
2140 scanlineSize);
2141
2142 cd->planarUncBufferEnd += scanlineSize;
2143 }
2144
2145 *unknownUncompressedSize += cd->planarUncSize;
2146 }
2147
2148 break;
2149
2150 default:
2151
2152 assert (false);
2153 }
2154
2155 encodedChannels[chan] = true;
2156 }
2157
2158 //
2159 // Pack the Unknown data into the output buffer first. Instead of
2160 // just copying it uncompressed, try zlib compression at least.
2161 //
2162
2163 if (*unknownUncompressedSize > 0)
2164 {
2165 uLongf inSize = (uLongf)(*unknownUncompressedSize);
2166 uLongf outSize = (uLongf)(ceil ((float)inSize * 1.01f) + 100);
2167
2168 if (Z_OK != ::compress2 ((Bytef *)outDataPtr,
2169 &outSize,
2170 (const Bytef *)_planarUncBuffer[UNKNOWN],
2171 inSize,
2172 9))
2173 {
2174 throw Iex::BaseExc ("Data compression (zlib) failed.");
2175 }
2176
2177 outDataPtr += outSize;
2178 *unknownCompressedSize = outSize;
2179 }
2180
2181 //
2182 // Now, pack all the Lossy DCT coefficients into our output
2183 // buffer, with Huffman encoding.
2184 //
2185 // Also, record the compressed size and the number of
2186 // uncompressed componentns we have.
2187 //
2188
2189 if (*totalAcUncompressedCount > 0)
2190 {
2191 switch (_acCompression)
2192 {
2193 case STATIC_HUFFMAN:
2194
2195 *acCompressedSize = (int)
2196 hufCompress((unsigned short *)_packedAcBuffer,
2197 (int)*totalAcUncompressedCount,
2198 outDataPtr);
2199 break;
2200
2201 case DEFLATE:
2202
2203 {
2204 uLongf destLen = (uLongf)
2205 (2 * (*totalAcUncompressedCount) * sizeof (unsigned short));
2206
2207 if (Z_OK != ::compress2
2208 ((Bytef *)outDataPtr,
2209 &destLen,
2210 (Bytef *)_packedAcBuffer,
2211 (uLong)(*totalAcUncompressedCount
2212 * sizeof (unsigned short)),
2213 9))
2214 {
2215 throw Iex::InputExc ("Data compression (zlib) failed.");
2216 }
2217
2218 *acCompressedSize = destLen;
2219 }
2220
2221 break;
2222
2223 default:
2224
2225 assert (false);
2226 }
2227
2228 outDataPtr += *acCompressedSize;
2229 }
2230
2231 //
2232 // Handle the DC components separately
2233 //
2234
2235 if (*totalDcUncompressedCount > 0)
2236 {
2237 *dcCompressedSize = _zip->compress
2238 (_packedDcBuffer,
2239 (int)(*totalDcUncompressedCount) * sizeof (unsigned short),
2240 outDataPtr);
2241
2242 outDataPtr += *dcCompressedSize;
2243 }
2244
2245 //
2246 // If we have RLE data, first RLE encode it and set the uncompressed
2247 // size. Then, deflate the results and set the compressed size.
2248 //
2249
2250 if (*rleRawSize > 0)
2251 {
2252 *rleUncompressedSize = rleCompress
2253 ((int)(*rleRawSize),
2254 _planarUncBuffer[RLE],
2255 (signed char *)_rleBuffer);
2256
2257 uLongf dstLen =
2258 (uLongf)ceil (1.01f * (float) * rleUncompressedSize) + 24;
2259
2260 if (Z_OK != ::compress2
2261 ((Bytef *)outDataPtr,
2262 &dstLen,
2263 (Bytef *)_rleBuffer,
2264 (uLong)(*rleUncompressedSize),
2265 9))
2266 {
2267 throw Iex::BaseExc ("Error compressing RLE'd data.");
2268 }
2269
2270 *rleCompressedSize = dstLen;
2271 outDataPtr += *rleCompressedSize;
2272 }
2273
2274 //
2275 // Flip the counters to XDR format
2276 //
2277
2278 for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
2279 {
2280 Int64 src = *(((Int64 *)_outBuffer) + i);
2281 char *dst = (char *)(((Int64 *)_outBuffer) + i);
2282
2283 Xdr::write<CharPtrIO> (dst, src);
2284 }
2285
2286 //
2287 // We're done - compute the number of bytes we packed
2288 //
2289
2290 outPtr = _outBuffer;
2291
2292 return static_cast<int>(outDataPtr - _outBuffer + 1);
2293 }
2294
2295
int
DwaCompressor::uncompress
    (const char *inPtr,
     int inSize,
     int minY,
     const char *&outPtr)
{
    //
    // Scanline entry point: build a box spanning this chunk's
    // scanlines across the full data-window width, then forward to
    // the Box2i overload, which does the real work.
    //

    return uncompress (inPtr,
                       inSize,
                       Imath::Box2i (Imath::V2i (_min[0], minY),
                                     Imath::V2i (_max[0], minY + numScanLines() - 1)),
                       outPtr);
}
2309
2310
int
DwaCompressor::uncompressTile
    (const char *inPtr,
     int inSize,
     Imath::Box2i range,
     const char *&outPtr)
{
    //
    // Tile entry point: tiles already come with their pixel range,
    // so just forward to the Box2i uncompress overload.
    //

    return uncompress (inPtr, inSize, range, outPtr);
}
2320
2321
2322 int
uncompress(const char * inPtr,int inSize,Imath::Box2i range,const char * & outPtr)2323 DwaCompressor::uncompress
2324 (const char *inPtr,
2325 int inSize,
2326 Imath::Box2i range,
2327 const char *&outPtr)
2328 {
2329 int minX = range.min.x;
2330 int maxX = std::min (range.max.x, _max[0]);
2331 int minY = range.min.y;
2332 int maxY = std::min (range.max.y, _max[1]);
2333
2334 int headerSize = NUM_SIZES_SINGLE*sizeof(Int64);
2335 if (inSize < headerSize)
2336 {
2337 throw Iex::InputExc("Error uncompressing DWA data"
2338 "(truncated header).");
2339 }
2340
2341 //
2342 // Flip the counters from XDR to NATIVE
2343 //
2344
2345 for (int i = 0; i < NUM_SIZES_SINGLE; ++i)
2346 {
2347 Int64 *dst = (((Int64 *)inPtr) + i);
2348 const char *src = (char *)(((Int64 *)inPtr) + i);
2349
2350 Xdr::read<CharPtrIO> (src, *dst);
2351 }
2352
2353 //
2354 // Unwind all the counter info
2355 //
2356
2357 const Int64 *inPtr64 = (const Int64*) inPtr;
2358
2359 Int64 version = *(inPtr64 + VERSION);
2360 Int64 unknownUncompressedSize = *(inPtr64 + UNKNOWN_UNCOMPRESSED_SIZE);
2361 Int64 unknownCompressedSize = *(inPtr64 + UNKNOWN_COMPRESSED_SIZE);
2362 Int64 acCompressedSize = *(inPtr64 + AC_COMPRESSED_SIZE);
2363 Int64 dcCompressedSize = *(inPtr64 + DC_COMPRESSED_SIZE);
2364 Int64 rleCompressedSize = *(inPtr64 + RLE_COMPRESSED_SIZE);
2365 Int64 rleUncompressedSize = *(inPtr64 + RLE_UNCOMPRESSED_SIZE);
2366 Int64 rleRawSize = *(inPtr64 + RLE_RAW_SIZE);
2367
2368 Int64 totalAcUncompressedCount = *(inPtr64 + AC_UNCOMPRESSED_COUNT);
2369 Int64 totalDcUncompressedCount = *(inPtr64 + DC_UNCOMPRESSED_COUNT);
2370
2371 Int64 acCompression = *(inPtr64 + AC_COMPRESSION);
2372
2373 Int64 compressedSize = unknownCompressedSize +
2374 acCompressedSize +
2375 dcCompressedSize +
2376 rleCompressedSize;
2377
2378 const char *dataPtr = inPtr + NUM_SIZES_SINGLE * sizeof(Int64);
2379
2380 /* Both the sum and individual sizes are checked in case of overflow. */
2381 if (inSize < (headerSize + compressedSize) ||
2382 inSize < unknownCompressedSize ||
2383 inSize < acCompressedSize ||
2384 inSize < dcCompressedSize ||
2385 inSize < rleCompressedSize)
2386 {
2387 throw Iex::InputExc("Error uncompressing DWA data"
2388 "(truncated file).");
2389 }
2390
2391 if (unknownUncompressedSize < 0 ||
2392 unknownCompressedSize < 0 ||
2393 acCompressedSize < 0 ||
2394 dcCompressedSize < 0 ||
2395 rleCompressedSize < 0 ||
2396 rleUncompressedSize < 0 ||
2397 rleRawSize < 0 ||
2398 totalAcUncompressedCount < 0 ||
2399 totalDcUncompressedCount < 0)
2400 {
2401 throw Iex::InputExc("Error uncompressing DWA data"
2402 " (corrupt header).");
2403 }
2404
2405 if (version < 2)
2406 initializeLegacyChannelRules();
2407 else
2408 {
2409 unsigned short ruleSize = 0;
2410 Xdr::read<CharPtrIO>(dataPtr, ruleSize);
2411
2412 if (ruleSize < 0)
2413 throw Iex::InputExc("Error uncompressing DWA data"
2414 " (corrupt header file).");
2415
2416 headerSize += ruleSize;
2417 if (inSize < headerSize + compressedSize)
2418 throw Iex::InputExc("Error uncompressing DWA data"
2419 " (truncated file).");
2420
2421 _channelRules.clear();
2422 ruleSize -= Xdr::size<unsigned short> ();
2423 while (ruleSize > 0)
2424 {
2425 Classifier rule(dataPtr, ruleSize);
2426
2427 _channelRules.push_back(rule);
2428 ruleSize -= rule.size();
2429 }
2430 }
2431
2432
2433 size_t outBufferSize = 0;
2434 initializeBuffers(outBufferSize);
2435
2436 //
2437 // Allocate _outBuffer, if we haven't done so already
2438 //
2439
2440 if (_maxScanLineSize * numScanLines() > _outBufferSize)
2441 {
2442 _outBufferSize = _maxScanLineSize * numScanLines();
2443 if (_outBuffer != 0)
2444 delete[] _outBuffer;
2445 _outBuffer = new char[_maxScanLineSize * numScanLines()];
2446 }
2447
2448
2449 char *outBufferEnd = _outBuffer;
2450
2451
2452 //
2453 // Find the start of the RLE packed AC components and
2454 // the DC components for each channel. This will be handy
2455 // if you want to decode the channels in parallel later on.
2456 //
2457
2458 char *packedAcBufferEnd = 0;
2459
2460 if (_packedAcBuffer)
2461 packedAcBufferEnd = _packedAcBuffer;
2462
2463 char *packedDcBufferEnd = 0;
2464
2465 if (_packedDcBuffer)
2466 packedDcBufferEnd = _packedDcBuffer;
2467
2468 //
2469 // UNKNOWN data is packed first, followed by the
2470 // Huffman-compressed AC, then the DC values,
2471 // and then the zlib compressed RLE data.
2472 //
2473
2474 const char *compressedUnknownBuf = dataPtr;
2475
2476 const char *compressedAcBuf = compressedUnknownBuf +
2477 static_cast<ptrdiff_t>(unknownCompressedSize);
2478 const char *compressedDcBuf = compressedAcBuf +
2479 static_cast<ptrdiff_t>(acCompressedSize);
2480 const char *compressedRleBuf = compressedDcBuf +
2481 static_cast<ptrdiff_t>(dcCompressedSize);
2482
2483 //
2484 // Sanity check that the version is something we expect. Right now,
2485 // we can decode version 0, 1, and 2. v1 adds 'end of block' symbols
2486 // to the AC RLE. v2 adds channel classification rules at the
2487 // start of the data block.
2488 //
2489
2490 if ((version < 0) || (version > 2))
2491 throw Iex::InputExc ("Invalid version of compressed data block");
2492
2493 setupChannelData(minX, minY, maxX, maxY);
2494
2495 //
2496 // Uncompress the UNKNOWN data into _planarUncBuffer[UNKNOWN]
2497 //
2498
2499 if (unknownCompressedSize > 0)
2500 {
2501 uLongf outSize = static_cast<uLongf>(
2502 ceil( (float)unknownUncompressedSize * 1.01) + 100);
2503
2504 if (unknownUncompressedSize < 0 ||
2505 outSize > _planarUncBufferSize[UNKNOWN])
2506 {
2507 throw Iex::InputExc("Error uncompressing DWA data"
2508 "(corrupt header).");
2509 }
2510
2511 if (Z_OK != ::uncompress
2512 ((Bytef *)_planarUncBuffer[UNKNOWN],
2513 &outSize,
2514 (Bytef *)compressedUnknownBuf,
2515 (uLong)unknownCompressedSize))
2516 {
2517 throw Iex::BaseExc("Error uncompressing UNKNOWN data.");
2518 }
2519 }
2520
2521 //
2522 // Uncompress the AC data into _packedAcBuffer
2523 //
2524
2525 if (acCompressedSize > 0)
2526 {
2527 if (totalAcUncompressedCount*sizeof(unsigned short) > _packedAcBufferSize)
2528 {
2529 throw Iex::InputExc("Error uncompressing DWA data"
2530 "(corrupt header).");
2531 }
2532
2533 //
2534 // Don't trust the user to get it right, look in the file.
2535 //
2536
2537 switch (acCompression)
2538 {
2539 case STATIC_HUFFMAN:
2540
2541 hufUncompress
2542 (compressedAcBuf,
2543 (int)acCompressedSize,
2544 (unsigned short *)_packedAcBuffer,
2545 (int)totalAcUncompressedCount);
2546
2547 break;
2548
2549 case DEFLATE:
2550 {
2551 uLongf destLen =
2552 (int)(totalAcUncompressedCount) * sizeof (unsigned short);
2553
2554 if (Z_OK != ::uncompress
2555 ((Bytef *)_packedAcBuffer,
2556 &destLen,
2557 (Bytef *)compressedAcBuf,
2558 (uLong)acCompressedSize))
2559 {
2560 throw Iex::InputExc ("Data decompression (zlib) failed.");
2561 }
2562
2563 if (totalAcUncompressedCount * sizeof (unsigned short) !=
2564 destLen)
2565 {
2566 throw Iex::InputExc ("AC data corrupt.");
2567 }
2568 }
2569 break;
2570
2571 default:
2572
2573 throw Iex::NoImplExc ("Unknown AC Compression");
2574 break;
2575 }
2576 }
2577
2578 //
2579 // Uncompress the DC data into _packedDcBuffer
2580 //
2581
2582 if (dcCompressedSize > 0)
2583 {
2584 if (totalDcUncompressedCount*sizeof(unsigned short) > _packedDcBufferSize)
2585 {
2586 throw Iex::InputExc("Error uncompressing DWA data"
2587 "(corrupt header).");
2588 }
2589
2590 if (_zip->uncompress
2591 (compressedDcBuf, (int)dcCompressedSize, _packedDcBuffer)
2592 != (int)totalDcUncompressedCount * sizeof (unsigned short))
2593 {
2594 throw Iex::BaseExc("DC data corrupt.");
2595 }
2596 }
2597
2598 //
2599 // Uncompress the RLE data into _rleBuffer, then unRLE the results
2600 // into _planarUncBuffer[RLE]
2601 //
2602
2603 if (rleRawSize > 0)
2604 {
2605 if (rleUncompressedSize > _rleBufferSize ||
2606 rleRawSize > _planarUncBufferSize[RLE])
2607 {
2608 throw Iex::InputExc("Error uncompressing DWA data"
2609 "(corrupt header).");
2610 }
2611
2612 uLongf dstLen = (uLongf)rleUncompressedSize;
2613
2614 if (Z_OK != ::uncompress
2615 ((Bytef *)_rleBuffer,
2616 &dstLen,
2617 (Bytef *)compressedRleBuf,
2618 (uLong)rleCompressedSize))
2619 {
2620 throw Iex::BaseExc("Error uncompressing RLE data.");
2621 }
2622
2623 if (dstLen != rleUncompressedSize)
2624 throw Iex::BaseExc("RLE data corrupted");
2625
2626 if (rleUncompress
2627 ((int)rleUncompressedSize,
2628 (int)rleRawSize,
2629 (signed char *)_rleBuffer,
2630 _planarUncBuffer[RLE]) != rleRawSize)
2631 {
2632 throw Iex::BaseExc("RLE data corrupted");
2633 }
2634 }
2635
2636 //
2637 // Determine the start of each row in the output buffer
2638 //
2639
2640 std::vector<bool> decodedChannels (_channelData.size());
2641 std::vector< std::vector<char *> > rowPtrs (_channelData.size());
2642
2643 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2644 decodedChannels[chan] = false;
2645
2646 outBufferEnd = _outBuffer;
2647
2648 for (int y = minY; y <= maxY; ++y)
2649 {
2650 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2651 {
2652 ChannelData *cd = &_channelData[chan];
2653
2654 if (Imath::modp (y, cd->ySampling) != 0)
2655 continue;
2656
2657 rowPtrs[chan].push_back (outBufferEnd);
2658 outBufferEnd += cd->width * Imf::pixelTypeSize (cd->type);
2659 }
2660 }
2661
2662 //
2663 // Setup to decode each block of 3 channels that need to
2664 // be handled together
2665 //
2666
2667 for (unsigned int csc = 0; csc < _cscSets.size(); ++csc)
2668 {
2669 int rChan = _cscSets[csc].idx[0];
2670 int gChan = _cscSets[csc].idx[1];
2671 int bChan = _cscSets[csc].idx[2];
2672
2673
2674 LossyDctDecoderCsc decoder
2675 (rowPtrs[rChan],
2676 rowPtrs[gChan],
2677 rowPtrs[bChan],
2678 packedAcBufferEnd,
2679 packedDcBufferEnd,
2680 dwaCompressorToLinear,
2681 _channelData[rChan].width,
2682 _channelData[rChan].height,
2683 _channelData[rChan].type,
2684 _channelData[gChan].type,
2685 _channelData[bChan].type);
2686
2687 decoder.execute();
2688
2689 packedAcBufferEnd +=
2690 decoder.numAcValuesEncoded() * sizeof (unsigned short);
2691
2692 packedDcBufferEnd +=
2693 decoder.numDcValuesEncoded() * sizeof (unsigned short);
2694
2695 decodedChannels[rChan] = true;
2696 decodedChannels[gChan] = true;
2697 decodedChannels[bChan] = true;
2698 }
2699
2700 //
2701 // Setup to handle the remaining channels by themselves
2702 //
2703
2704 for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
2705 {
2706 if (decodedChannels[chan])
2707 continue;
2708
2709 ChannelData *cd = &_channelData[chan];
2710 int pixelSize = Imf::pixelTypeSize (cd->type);
2711
2712 switch (cd->compression)
2713 {
2714 case LOSSY_DCT:
2715
2716 //
2717 // Setup a single-channel lossy DCT decoder pointing
2718 // at the output buffer
2719 //
2720
2721 {
2722 const unsigned short *linearLut = 0;
2723
2724 if (!cd->pLinear)
2725 linearLut = dwaCompressorToLinear;
2726
2727 LossyDctDecoder decoder
2728 (rowPtrs[chan],
2729 packedAcBufferEnd,
2730 packedDcBufferEnd,
2731 linearLut,
2732 cd->width,
2733 cd->height,
2734 cd->type);
2735
2736 decoder.execute();
2737
2738 packedAcBufferEnd +=
2739 decoder.numAcValuesEncoded() * sizeof (unsigned short);
2740
2741 packedDcBufferEnd +=
2742 decoder.numDcValuesEncoded() * sizeof (unsigned short);
2743 }
2744
2745 break;
2746
2747 case RLE:
2748
2749 //
2750 // For the RLE case, the data has been un-RLE'd into
2751 // planarUncRleEnd[], but is still split out by bytes.
2752 // We need to rearrange the bytes back into the correct
2753 // order in the output buffer;
2754 //
2755
2756 {
2757 int row = 0;
2758
2759 for (int y = minY; y <= maxY; ++y)
2760 {
2761 if (Imath::modp (y, cd->ySampling) != 0)
2762 continue;
2763
2764 char *dst = rowPtrs[chan][row];
2765
2766 if (pixelSize == 2)
2767 {
2768 interleaveByte2 (dst,
2769 cd->planarUncRleEnd[0],
2770 cd->planarUncRleEnd[1],
2771 cd->width);
2772
2773 cd->planarUncRleEnd[0] += cd->width;
2774 cd->planarUncRleEnd[1] += cd->width;
2775 }
2776 else
2777 {
2778 for (int x = 0; x < cd->width; ++x)
2779 {
2780 for (int byte = 0; byte < pixelSize; ++byte)
2781 {
2782 *dst++ = *cd->planarUncRleEnd[byte]++;
2783 }
2784 }
2785 }
2786
2787 row++;
2788 }
2789 }
2790
2791 break;
2792
2793 case UNKNOWN:
2794
2795 //
2796 // In the UNKNOWN case, data is already in planarUncBufferEnd
2797 // and just needs to copied over to the output buffer
2798 //
2799
2800 {
2801 int row = 0;
2802 int dstScanlineSize = cd->width * Imf::pixelTypeSize (cd->type);
2803
2804 for (int y = minY; y <= maxY; ++y)
2805 {
2806 if (Imath::modp (y, cd->ySampling) != 0)
2807 continue;
2808
2809 memcpy (rowPtrs[chan][row],
2810 cd->planarUncBufferEnd,
2811 dstScanlineSize);
2812
2813 cd->planarUncBufferEnd += dstScanlineSize;
2814 row++;
2815 }
2816 }
2817
2818 break;
2819
2820 default:
2821
2822 throw Iex::NoImplExc ("Unhandled compression scheme case");
2823 break;
2824 }
2825
2826 decodedChannels[chan] = true;
2827 }
2828
2829 //
2830 // Return a ptr to _outBuffer
2831 //
2832
2833 outPtr = _outBuffer;
2834 return (int)(outBufferEnd - _outBuffer);
2835 }
2836
2837
2838 // static
2839 void
initializeFuncs()2840 DwaCompressor::initializeFuncs()
2841 {
2842 convertFloatToHalf64 = convertFloatToHalf64_scalar;
2843 fromHalfZigZag = fromHalfZigZag_scalar;
2844
2845 CpuId cpuId;
2846
2847 //
2848 // Setup HALF <-> FLOAT conversion implementations
2849 //
2850
2851 if (cpuId.avx && cpuId.f16c)
2852 {
2853 convertFloatToHalf64 = convertFloatToHalf64_f16c;
2854 fromHalfZigZag = fromHalfZigZag_f16c;
2855 }
2856
2857 //
2858 // Setup inverse DCT implementations
2859 //
2860
2861 dctInverse8x8_0 = dctInverse8x8_scalar<0>;
2862 dctInverse8x8_1 = dctInverse8x8_scalar<1>;
2863 dctInverse8x8_2 = dctInverse8x8_scalar<2>;
2864 dctInverse8x8_3 = dctInverse8x8_scalar<3>;
2865 dctInverse8x8_4 = dctInverse8x8_scalar<4>;
2866 dctInverse8x8_5 = dctInverse8x8_scalar<5>;
2867 dctInverse8x8_6 = dctInverse8x8_scalar<6>;
2868 dctInverse8x8_7 = dctInverse8x8_scalar<7>;
2869
2870 if (cpuId.avx)
2871 {
2872 dctInverse8x8_0 = dctInverse8x8_avx<0>;
2873 dctInverse8x8_1 = dctInverse8x8_avx<1>;
2874 dctInverse8x8_2 = dctInverse8x8_avx<2>;
2875 dctInverse8x8_3 = dctInverse8x8_avx<3>;
2876 dctInverse8x8_4 = dctInverse8x8_avx<4>;
2877 dctInverse8x8_5 = dctInverse8x8_avx<5>;
2878 dctInverse8x8_6 = dctInverse8x8_avx<6>;
2879 dctInverse8x8_7 = dctInverse8x8_avx<7>;
2880 }
2881 else if (cpuId.sse2)
2882 {
2883 dctInverse8x8_0 = dctInverse8x8_sse2<0>;
2884 dctInverse8x8_1 = dctInverse8x8_sse2<1>;
2885 dctInverse8x8_2 = dctInverse8x8_sse2<2>;
2886 dctInverse8x8_3 = dctInverse8x8_sse2<3>;
2887 dctInverse8x8_4 = dctInverse8x8_sse2<4>;
2888 dctInverse8x8_5 = dctInverse8x8_sse2<5>;
2889 dctInverse8x8_6 = dctInverse8x8_sse2<6>;
2890 dctInverse8x8_7 = dctInverse8x8_sse2<7>;
2891 }
2892 }
2893
2894
2895 //
2896 // Handle channel classification and buffer allocation once we know
2897 // how to classify channels
2898 //
2899
//
// Classify the channels and (re)allocate every scratch buffer used by
// compress()/uncompress(): _zip, _packedAcBuffer, _packedDcBuffer,
// _rleBuffer and _planarUncBuffer[]. On return, outBufferSize holds the
// worst-case size needed for _outBuffer when encoding (the caller
// allocates _outBuffer lazily).
//
void
DwaCompressor::initializeBuffers (size_t &outBufferSize)
{
    classifyChannels (_channels, _channelData, _cscSets);

    //
    // _outBuffer needs to be big enough to hold all our
    // compressed data - which could vary depending on what sort
    // of channels we have.
    //

    int maxOutBufferSize = 0;
    int numLossyDctChans = 0;
    int unknownBufferSize = 0;
    int rleBufferSize = 0;

    // Worst-case packed AC size for one LOSSY_DCT channel: the image is
    // padded up to whole 8x8 blocks, each contributing 63 AC coefficients.
    int maxLossyDctAcSize = (int)ceil ((float)numScanLines() / 8.0f) *
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            63 * sizeof (unsigned short);

    // Worst-case packed DC size: one coefficient per 8x8 block.
    int maxLossyDctDcSize = (int)ceil ((float)numScanLines() / 8.0f) *
                            (int)ceil ((float)(_max[0] - _min[0] + 1) / 8.0f) *
                            sizeof (unsigned short);

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
            case LOSSY_DCT:

                //
                // This is the size of the number of packed
                // components, plus the requirements for
                // maximum Huffman encoding size.
                //

                maxOutBufferSize += 2 * maxLossyDctAcSize + 65536;
                numLossyDctChans++;
                break;

            case RLE:
            {
                //
                // RLE, if gone horribly wrong, could double the size
                // of the source data.
                //

                int rleAmount = 2 * numScanLines() * (_max[0] - _min[0] + 1) *
                                Imf::pixelTypeSize (_channelData[chan].type);

                rleBufferSize += rleAmount;
            }
            break;


            case UNKNOWN:

                unknownBufferSize += numScanLines() * (_max[0] - _min[0] + 1) *
                                     Imf::pixelTypeSize (_channelData[chan].type);
                break;

            default:

                throw Iex::NoImplExc ("Unhandled compression scheme case");
                break;
        }
    }

    //
    // Also, since the results of the RLE are packed into
    // the output buffer, we need the extra room there. But
    // we're going to zlib compress() the data we pack,
    // which could take slightly more space
    //
    // (1% + a constant is zlib's documented worst-case expansion.)
    //

    maxOutBufferSize += (int)(ceil (1.01f * (float)rleBufferSize) + 100);

    //
    // And the same goes for the UNKNOWN data
    //

    maxOutBufferSize += (int)(ceil (1.01f * (float)unknownBufferSize) + 100);

    //
    // Allocate a zip/deflate compressor big enought to hold the DC data
    // and include it's compressed results in the size requirements
    // for our output buffer
    //

    if (_zip == 0)
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    else if (_zip->maxRawSize() < maxLossyDctDcSize * numLossyDctChans)
    {
        // Existing compressor is too small; replace it.
        delete _zip;
        _zip = new Zip (maxLossyDctDcSize * numLossyDctChans);
    }


    maxOutBufferSize += _zip->maxCompressedSize();

    //
    // We also need to reserve space at the head of the buffer to
    // write out the size of our various packed and compressed data.
    //

    maxOutBufferSize += NUM_SIZES_SINGLE * sizeof (Int64);


    //
    // Later, we're going to hijack outBuffer for the result of
    // both encoding and decoding. So it needs to be big enough
    // to hold either a buffers' worth of uncompressed or
    // compressed data
    //
    // For encoding, we'll need _outBuffer to hold maxOutBufferSize bytes,
    // but for decoding, we only need it to be maxScanLineSize*numScanLines.
    // Cache the max size for now, and alloc the buffer when we either
    // encode or decode.
    //

    outBufferSize = maxOutBufferSize;


    //
    // _packedAcBuffer holds the quantized DCT coefficients prior
    // to Huffman encoding
    //

    if (maxLossyDctAcSize * numLossyDctChans > _packedAcBufferSize)
    {
        _packedAcBufferSize = maxLossyDctAcSize * numLossyDctChans;
        if (_packedAcBuffer != 0)
            delete[] _packedAcBuffer;
        _packedAcBuffer = new char[_packedAcBufferSize];
    }

    //
    // _packedDcBuffer holds one quantized DCT coef per 8x8 block
    //

    if (maxLossyDctDcSize * numLossyDctChans > _packedDcBufferSize)
    {
        _packedDcBufferSize = maxLossyDctDcSize * numLossyDctChans;
        if (_packedDcBuffer != 0)
            delete[] _packedDcBuffer;
        _packedDcBuffer = new char[_packedDcBufferSize];
    }

    // _rleBuffer holds the byte-split RLE stream before zlib compression
    // (encode) or after zlib decompression (decode).
    if (rleBufferSize > _rleBufferSize)
    {
        _rleBufferSize = rleBufferSize;
        if (_rleBuffer != 0)
            delete[] _rleBuffer;
        _rleBuffer = new char[rleBufferSize];
    }

    //
    // The planar uncompressed buffer will hold float data for LOSSY_DCT
    // compressed values, and whatever the native type is for other
    // channels. We're going to use this to hold data in a planar
    // format, as opposed to the native interleaved format we take
    // into compress() and give back from uncompress().
    //
    // This also makes it easier to compress the UNKNOWN and RLE data
    // all in one swoop (for each compression scheme).
    //

    int planarUncBufferSize[NUM_COMPRESSOR_SCHEMES];
    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
        planarUncBufferSize[i] = 0;

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        switch (_channelData[chan].compression)
        {
            case LOSSY_DCT:
                // LOSSY_DCT channels go through _packedAcBuffer /
                // _packedDcBuffer instead, so no planar space is needed.
                break;

            case RLE:
                planarUncBufferSize[RLE] +=
                         numScanLines() * (_max[0] - _min[0] + 1) *
                         Imf::pixelTypeSize (_channelData[chan].type);
                break;

            case UNKNOWN:
                planarUncBufferSize[UNKNOWN] +=
                         numScanLines() * (_max[0] - _min[0] + 1) *
                         Imf::pixelTypeSize (_channelData[chan].type);
                break;

            default:
                throw Iex::NoImplExc ("Unhandled compression scheme case");
                break;
        }
    }

    //
    // UNKNOWN data is going to be zlib compressed, which needs
    // a little extra headroom
    //

    if (planarUncBufferSize[UNKNOWN] > 0)
    {
        planarUncBufferSize[UNKNOWN] =
            (int) ceil (1.01f * (float)planarUncBufferSize[UNKNOWN]) + 100;
    }

    // Grow (never shrink) the per-scheme planar buffers.
    for (int i = 0; i < NUM_COMPRESSOR_SCHEMES; ++i)
    {
        if (planarUncBufferSize[i] > _planarUncBufferSize[i])
        {
            _planarUncBufferSize[i] = planarUncBufferSize[i];
            if (_planarUncBuffer[i] != 0)
                delete[] _planarUncBuffer[i];
            _planarUncBuffer[i] = new char[planarUncBufferSize[i]];
        }
    }
}
3118
3119
3120 //
3121 // Setup channel classification rules to use when writing files
3122 //
3123
3124 void
initializeDefaultChannelRules()3125 DwaCompressor::initializeDefaultChannelRules ()
3126 {
3127 _channelRules.clear();
3128
3129 _channelRules.push_back (Classifier ("R", LOSSY_DCT, HALF, 0, false));
3130 _channelRules.push_back (Classifier ("R", LOSSY_DCT, FLOAT, 0, false));
3131 _channelRules.push_back (Classifier ("G", LOSSY_DCT, HALF, 1, false));
3132 _channelRules.push_back (Classifier ("G", LOSSY_DCT, FLOAT, 1, false));
3133 _channelRules.push_back (Classifier ("B", LOSSY_DCT, HALF, 2, false));
3134 _channelRules.push_back (Classifier ("B", LOSSY_DCT, FLOAT, 2, false));
3135
3136 _channelRules.push_back (Classifier ("Y", LOSSY_DCT, HALF, -1, false));
3137 _channelRules.push_back (Classifier ("Y", LOSSY_DCT, FLOAT, -1, false));
3138 _channelRules.push_back (Classifier ("BY", LOSSY_DCT, HALF, -1, false));
3139 _channelRules.push_back (Classifier ("BY", LOSSY_DCT, FLOAT, -1, false));
3140 _channelRules.push_back (Classifier ("RY", LOSSY_DCT, HALF, -1, false));
3141 _channelRules.push_back (Classifier ("RY", LOSSY_DCT, FLOAT, -1, false));
3142
3143 _channelRules.push_back (Classifier ("A", RLE, UINT, -1, false));
3144 _channelRules.push_back (Classifier ("A", RLE, HALF, -1, false));
3145 _channelRules.push_back (Classifier ("A", RLE, FLOAT, -1, false));
3146 }
3147
3148
3149 //
3150 // Setup channel classification rules when reading files with VERSION < 2
3151 //
3152
3153 void
initializeLegacyChannelRules()3154 DwaCompressor::initializeLegacyChannelRules ()
3155 {
3156 _channelRules.clear();
3157
3158 _channelRules.push_back (Classifier ("r", LOSSY_DCT, HALF, 0, true));
3159 _channelRules.push_back (Classifier ("r", LOSSY_DCT, FLOAT, 0, true));
3160 _channelRules.push_back (Classifier ("red", LOSSY_DCT, HALF, 0, true));
3161 _channelRules.push_back (Classifier ("red", LOSSY_DCT, FLOAT, 0, true));
3162 _channelRules.push_back (Classifier ("g", LOSSY_DCT, HALF, 1, true));
3163 _channelRules.push_back (Classifier ("g", LOSSY_DCT, FLOAT, 1, true));
3164 _channelRules.push_back (Classifier ("grn", LOSSY_DCT, HALF, 1, true));
3165 _channelRules.push_back (Classifier ("grn", LOSSY_DCT, FLOAT, 1, true));
3166 _channelRules.push_back (Classifier ("green", LOSSY_DCT, HALF, 1, true));
3167 _channelRules.push_back (Classifier ("green", LOSSY_DCT, FLOAT, 1, true));
3168 _channelRules.push_back (Classifier ("b", LOSSY_DCT, HALF, 2, true));
3169 _channelRules.push_back (Classifier ("b", LOSSY_DCT, FLOAT, 2, true));
3170 _channelRules.push_back (Classifier ("blu", LOSSY_DCT, HALF, 2, true));
3171 _channelRules.push_back (Classifier ("blu", LOSSY_DCT, FLOAT, 2, true));
3172 _channelRules.push_back (Classifier ("blue", LOSSY_DCT, HALF, 2, true));
3173 _channelRules.push_back (Classifier ("blue", LOSSY_DCT, FLOAT, 2, true));
3174
3175 _channelRules.push_back (Classifier ("y", LOSSY_DCT, HALF, -1, true));
3176 _channelRules.push_back (Classifier ("y", LOSSY_DCT, FLOAT, -1, true));
3177 _channelRules.push_back (Classifier ("by", LOSSY_DCT, HALF, -1, true));
3178 _channelRules.push_back (Classifier ("by", LOSSY_DCT, FLOAT, -1, true));
3179 _channelRules.push_back (Classifier ("ry", LOSSY_DCT, HALF, -1, true));
3180 _channelRules.push_back (Classifier ("ry", LOSSY_DCT, FLOAT, -1, true));
3181 _channelRules.push_back (Classifier ("a", RLE, UINT, -1, true));
3182 _channelRules.push_back (Classifier ("a", RLE, HALF, -1, true));
3183 _channelRules.push_back (Classifier ("a", RLE, FLOAT, -1, true));
3184 }
3185
3186
3187 //
3188 // Given a set of rules and ChannelData, figure out which rules apply
3189 //
3190
3191 void
relevantChannelRules(std::vector<Classifier> & rules) const3192 DwaCompressor::relevantChannelRules (std::vector<Classifier> &rules) const
3193 {
3194 rules.clear();
3195
3196 std::vector<std::string> suffixes;
3197
3198 for (size_t cd = 0; cd < _channelData.size(); ++cd)
3199 {
3200 std::string suffix = _channelData[cd].name;
3201 size_t lastDot = suffix.find_last_of ('.');
3202
3203 if (lastDot != std::string::npos)
3204 suffix = suffix.substr (lastDot+1, std::string::npos);
3205
3206 suffixes.push_back(suffix);
3207 }
3208
3209
3210 for (size_t i = 0; i < _channelRules.size(); ++i)
3211 {
3212 for (size_t cd = 0; cd < _channelData.size(); ++cd)
3213 {
3214 if (_channelRules[i].match (suffixes[cd], _channelData[cd].type ))
3215 {
3216 rules.push_back (_channelRules[i]);
3217 break;
3218 }
3219 }
3220 }
3221 }
3222
3223
3224 //
3225 // Take our initial list of channels, and cache the contents.
3226 //
3227 // Determine approprate compression schemes for each channel,
3228 // and figure out which sets should potentially be CSC'ed
3229 // prior to lossy compression.
3230 //
3231
//
// Cache per-channel info from 'channels' into 'chanData', assign each
// channel a compression scheme via the current _channelRules, and
// collect complete R/G/B triples (with matching sampling rates) into
// 'cscData' as candidates for color-space conversion before lossy
// compression.
//
void
DwaCompressor::classifyChannels
    (ChannelList channels,
     std::vector<ChannelData> &chanData,
     std::vector<CscChannelSet> &cscData)
{
    //
    // prefixMap used to map channel name prefixes to
    // potential CSC-able sets of channels.
    //

    std::map<std::string, DwaCompressor::CscChannelSet> prefixMap;
    std::vector<DwaCompressor::CscChannelSet> tmpCscSet;

    unsigned int numChan = 0;

    // ChannelList has no size(); count by iteration.
    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
        numChan++;

    if (numChan)
        chanData.resize (numChan);

    //
    // Cache the relevant data from the channel structs.
    //

    unsigned int offset = 0;

    for (ChannelList::Iterator c = channels.begin(); c != channels.end(); ++c)
    {
        chanData[offset].name        = std::string (c.name());
        // UNKNOWN is the fallback if no rule below matches this channel.
        chanData[offset].compression = UNKNOWN;
        chanData[offset].xSampling   = c.channel().xSampling;
        chanData[offset].ySampling   = c.channel().ySampling;
        chanData[offset].type        = c.channel().type;
        chanData[offset].pLinear     = c.channel().pLinear;

        offset++;
    }

    //
    // Try and figure out which channels should be
    // compressed by which means.
    //

    for (offset = 0; offset<numChan; ++offset)
    {
        // Split "layer.chan" into prefix ("layer") and suffix ("chan");
        // a name without '.' has an empty prefix.
        std::string prefix  = "";
        std::string suffix  = chanData[offset].name;
        size_t      lastDot = suffix.find_last_of ('.');

        if (lastDot != std::string::npos)
        {
            prefix = suffix.substr (0, lastDot);
            suffix = suffix.substr (lastDot+1, std::string::npos);
        }

        //
        // Make sure we have an entry in our CSC set map
        //

        std::map<std::string, DwaCompressor::CscChannelSet>::iterator
            theSet = prefixMap.find (prefix);

        if (theSet == prefixMap.end())
        {
            // New prefix: start with all three CSC slots unassigned.
            DwaCompressor::CscChannelSet tmpSet;

            tmpSet.idx[0] =
            tmpSet.idx[1] =
            tmpSet.idx[2] = -1;

            prefixMap[prefix] = tmpSet;
        }

        //
        // Check the suffix against the list of classifications
        // we defined previously. If the _cscIdx is not negative,
        // it indicates that we should be part of a CSC group.
        //
        // Note: all matching rules are applied in order, so a later
        // matching rule overwrites the scheme set by an earlier one.
        //

        for (std::vector<Classifier>::iterator i = _channelRules.begin();
             i != _channelRules.end();
             ++i)
        {
            if ( i->match(suffix, chanData[offset].type) )
            {
                chanData[offset].compression = i->_scheme;

                if ( i->_cscIdx >= 0)
                    prefixMap[prefix].idx[i->_cscIdx] = offset;
            }
        }
    }

    //
    // Finally, try and find RGB sets of channels which
    // can be CSC'ed to a Y'CbCr space prior to loss, for
    // better compression.
    //
    // Walk over our set of candidates, and see who has
    // all three channels defined (and has common sampling
    // patterns, etc).
    //

    for (std::map<std::string, DwaCompressor::CscChannelSet>::iterator
         theItem = prefixMap.begin(); theItem != prefixMap.end();
         ++theItem)
    {
        int red = (*theItem).second.idx[0];
        int grn = (*theItem).second.idx[1];
        int blu = (*theItem).second.idx[2];

        // Reject incomplete triples.
        if ((red < 0) || (grn < 0) || (blu < 0))
            continue;

        // Reject triples whose channels are sampled differently;
        // CSC requires co-sited samples.
        if ((chanData[red].xSampling != chanData[grn].xSampling) ||
            (chanData[red].xSampling != chanData[blu].xSampling) ||
            (chanData[grn].xSampling != chanData[blu].xSampling) ||
            (chanData[red].ySampling != chanData[grn].ySampling) ||
            (chanData[red].ySampling != chanData[blu].ySampling) ||
            (chanData[grn].ySampling != chanData[blu].ySampling))
        {
            continue;
        }

        tmpCscSet.push_back ((*theItem).second);
    }

    size_t numCsc = tmpCscSet.size();

    if (numCsc)
        cscData.resize(numCsc);

    for (offset = 0; offset < numCsc; ++offset)
        cscData[offset] = tmpCscSet[offset];
}
3369
3370
3371
3372 //
3373 // Setup some buffer pointers, determine channel sizes, things
3374 // like that.
3375 //
3376
//
// For the pixel range [minX,maxX] x [minY,maxY], compute each channel's
// sampled width/height and carve the per-scheme planar buffers
// (_planarUncBuffer[]) into per-channel regions, initializing the
// running begin/end pointers that compress()/uncompress() advance.
//
void
DwaCompressor::setupChannelData (int minX, int minY, int maxX, int maxY)
{
    // Running allocation cursor into each scheme's planar buffer.
    char *planarUncBuffer[NUM_COMPRESSOR_SCHEMES];

    for (int i=0; i<NUM_COMPRESSOR_SCHEMES; ++i)
    {
        planarUncBuffer[i] = 0;

        if (_planarUncBuffer[i])
            planarUncBuffer[i] = _planarUncBuffer[i];
    }

    for (unsigned int chan = 0; chan < _channelData.size(); ++chan)
    {
        ChannelData *cd = &_channelData[chan];

        // Dimensions after accounting for the channel's subsampling.
        cd->width  = Imf::numSamples (cd->xSampling, minX, maxX);
        cd->height = Imf::numSamples (cd->ySampling, minY, maxY);

        cd->planarUncSize =
            cd->width * cd->height * Imf::pixelTypeSize (cd->type);

        cd->planarUncBuffer    = planarUncBuffer[cd->compression];
        cd->planarUncBufferEnd = cd->planarUncBuffer;

        // RLE channels are stored byte-planar: one plane per byte of
        // the pixel type, laid out back to back within this channel's
        // region. Set up the per-byte-plane start/end pointers.
        cd->planarUncRle[0]    = cd->planarUncBuffer;
        cd->planarUncRleEnd[0] = cd->planarUncRle[0];

        for (int byte = 1; byte < Imf::pixelTypeSize(cd->type); ++byte)
        {
            cd->planarUncRle[byte] =
                     cd->planarUncRle[byte-1] + cd->width * cd->height;

            cd->planarUncRleEnd[byte] =
                     cd->planarUncRle[byte];
        }

        cd->planarUncType = cd->type;

        if (cd->compression == LOSSY_DCT)
        {
            // Lossy DCT operates on float data and goes through the
            // packed AC/DC buffers, so the planar cursor is not advanced.
            cd->planarUncType = FLOAT;
        }
        else
        {
            planarUncBuffer[cd->compression] +=
                cd->width * cd->height * Imf::pixelTypeSize (cd->planarUncType);
        }
    }
}
3428
3429 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
3430