1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // * Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33
34 #define OPENEXR_BUILTIN_TABLES
35
36 //
37 // A program to generate various acceleration lookup tables
38 // for Imf::DwaCompressor
39 //
40
41 #include <cstddef>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <math.h>
45 #include <vector>
46
47 #include <OpenEXRConfig.h>
48
49 #ifndef OPENEXR_BUILTIN_TABLES
50 #ifdef OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN
51 #include <unistd.h>
52 #endif
53 #endif // OPENEXR_BUILTIN_TABLES
54
55 #include <half.h>
56 #include <IlmThread.h>
57 #include <IlmThreadSemaphore.h>
58 #include <ImfIO.h>
59 #include <ImfXdr.h>
60 #include "ImfNamespace.h"
61
62 using namespace OPENEXR_IMF_NAMESPACE;
63
64 namespace {
65
66 #ifdef OPENEXR_BUILTIN_TABLES
67 static unsigned short dwaCompressorNoOp[0x10000] = {};
68 static unsigned short dwaCompressorToLinear[0x10000] = {};
69 static unsigned short dwaCompressorToNonlinear[0x10000] = {};
70
71 //static unsigned int closestDataOffset[0x10000] = {};
72 //static unsigned short closestData[0x80000] = {};
73 #else
74
75 class LutHeaderWorker
76 {
77 public:
78 class Runner : public ILMTHREAD_NAMESPACE::Thread
79 {
80 public:
81 Runner(LutHeaderWorker &worker, bool output):
82 ILMTHREAD_NAMESPACE::Thread(),
83 _worker(worker),
84 _output(output)
85 {
86 start();
87 }
88
89 virtual ~Runner()
90 {
91 _semaphore.wait();
92 }
93
94 virtual void run()
95 {
96 _semaphore.post();
97 _worker.run(_output);
98 }
99
100 private:
101 LutHeaderWorker &_worker;
102 bool _output;
103 ILMTHREAD_NAMESPACE::Semaphore _semaphore;
104
105 }; // class LutHeaderWorker::Runner
106
107
108 LutHeaderWorker(size_t startValue,
109 size_t endValue):
110 _lastCandidateCount(0),
111 _startValue(startValue),
112 _endValue(endValue),
113 _numElements(0),
114 _offset(new size_t[numValues()]),
115 _elements(new unsigned short[1024*1024*2])
116 {
117 }
118
119 ~LutHeaderWorker()
120 {
121 delete[] _offset;
122 delete[] _elements;
123 }
124
125 size_t lastCandidateCount() const
126 {
127 return _lastCandidateCount;
128 }
129
130 size_t numValues() const
131 {
132 return _endValue - _startValue;
133 }
134
135 size_t numElements() const
136 {
137 return _numElements;
138 }
139
140 const size_t* offset() const
141 {
142 return _offset;
143 }
144
145 const unsigned short* elements() const
146 {
147 return _elements;
148 }
149
150 void run(bool outputProgress)
151 {
152 half candidate[16];
153 int candidateCount = 0;
154
155 for (size_t input=_startValue; input<_endValue; ++input) {
156
157 if (outputProgress) {
158 #ifdef __GNUC__
159 if (input % 100 == 0) {
160 fprintf(stderr,
161 " Building acceleration for DwaCompressor, %.2f %% %c",
162 100.*(float)input/(float)numValues(), 13);
163 }
164 #else
165 if (input % 1000 == 0) {
166 fprintf(stderr,
167 " Building acceleration for DwaCompressor, %.2f %%\n",
168 100.*(float)input/(float)numValues());
169 }
170 #endif
171 }
172
173
174 int numSetBits = countSetBits(input);
175 half inputHalf, closestHalf;
176
177 inputHalf.setBits(input);
178
179 _offset[input - _startValue] = _numElements;
180
181 // Gather candidates
182 candidateCount = 0;
183 for (int targetNumSetBits=numSetBits-1; targetNumSetBits>=0;
184 --targetNumSetBits) {
185 bool valueFound = false;
186
187 for (int i=0; i<65536; ++i) {
188 if (countSetBits(i) != targetNumSetBits) continue;
189
190 if (!valueFound) {
191 closestHalf.setBits(i);
192 valueFound = true;
193 } else {
194 half tmpHalf;
195
196 tmpHalf.setBits(i);
197
198 if (fabs((float)inputHalf - (float)tmpHalf) <
199 fabs((float)inputHalf - (float)closestHalf)) {
200 closestHalf = tmpHalf;
201 }
202 }
203 }
204
205 if (valueFound == false) {
206 fprintf(stderr, "bork bork bork!\n");
207 }
208
209 candidate[candidateCount] = closestHalf;
210 candidateCount++;
211 }
212
213 // Sort candidates by increasing number of bits set
214 for (int i=0; i<candidateCount; ++i) {
215 for (int j=i+1; j<candidateCount; ++j) {
216
217 int iCnt = countSetBits(candidate[i].bits());
218 int jCnt = countSetBits(candidate[j].bits());
219
220 if (jCnt < iCnt) {
221 half tmp = candidate[i];
222 candidate[i] = candidate[j];
223 candidate[j] = tmp;
224 }
225 }
226 }
227
228 // Copy candidates to the data buffer;
229 for (int i=0; i<candidateCount; ++i) {
230 _elements[_numElements] = candidate[i].bits();
231 _numElements++;
232 }
233
234 if (input == _endValue-1) {
235 _lastCandidateCount = candidateCount;
236 }
237 }
238 }
239
240
241 private:
242 size_t _lastCandidateCount;
243 size_t _startValue;
244 size_t _endValue;
245 size_t _numElements;
246 size_t *_offset;
247 unsigned short *_elements;
248
249 //
250 // Precomputing the bit count runs faster than using
251 // the builtin instruction, at least in one case..
252 //
253 // Precomputing 8-bits is no slower than 16-bits,
254 // and saves a fair bit of overhead..
255 //
256 int countSetBits(unsigned short src)
257 {
258 static const unsigned short numBitsSet[256] =
259 {
260 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
261 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
262 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
263 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
264 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
265 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
266 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
267 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
268 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
269 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
270 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
271 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
272 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
273 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
274 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
275 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
276 };
277
278 return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
279 }
280
281 }; // class LutHeaderWorker
282
283 #endif // OPENEXR_BUILTIN_TABLES
284
285 } // namespace
286
287
288 //
289 // Generate a no-op LUT, to cut down in conditional branches
290 //
291 static void
generateNoop()292 generateNoop()
293 {
294 #ifndef OPENEXR_BUILTIN_TABLES
295 printf("const unsigned short dwaCompressorNoOp[] = \n");
296 printf("{");
297 #endif // OPENEXR_BUILTIN_TABLES
298 for (int i=0; i<65536; ++i) {
299 #ifndef OPENEXR_BUILTIN_TABLES
300 if (i % 8 == 0) {
301 printf("\n ");
302 }
303 #endif // OPENEXR_BUILTIN_TABLES
304 unsigned short dst;
305 char *tmp = (char *)(&dst);
306
307 unsigned short src = (unsigned short)i;
308 Xdr::write <CharPtrIO> (tmp, src);
309 #ifndef OPENEXR_BUILTIN_TABLES
310 printf("0x%04x, ", dst);
311 #else
312 dwaCompressorNoOp[i] = dst;
313 #endif // OPENEXR_BUILTIN_TABLES
314 }
315 #ifndef OPENEXR_BUILTIN_TABLES
316 printf("\n};\n");
317 #endif // OPENEXR_BUILTIN_TABLES
318 }
319
320 //
321 // Nonlinearly encode luminance. For values below 1.0, we want
322 // to use a gamma 2.2 function to match what is fairly common
323 // for storing output referred. However, > 1, gamma functions blow up,
324 // and log functions are much better behaved. We could use a log
325 // function everywhere, but it tends to over-sample dark
326 // regions and undersample the brighter regions, when
327 // compared to the way real devices reproduce values.
328 //
329 // So, above 1, use a log function which is a smooth blend
330 // into the gamma function.
331 //
332 // Nonlinear(linear) =
333 //
334 // linear^(1./2.2) / linear <= 1.0
335 // |
336 // ln(linear)/ln(e^2.2) + 1 \ otherwise
337 //
338 //
339 // toNonlinear[] needs to take in XDR format half float values,
340 // and output NATIVE format float.
341 //
342 // toLinear[] does the opposite - takes in NATIVE half and
343 // outputs XDR half values.
344 //
345
346 static void
generateToLinear()347 generateToLinear()
348 {
349 #ifndef OPENEXR_BUILTIN_TABLES
350 unsigned short toLinear[65536];
351 #else
352 unsigned short* toLinear = dwaCompressorToLinear;
353 #endif // OPENEXR_BUILTIN_TABLES
354
355 toLinear[0] = 0;
356
357 for (int i=1; i<65536; ++i) {
358 half h;
359 float sign = 1;
360 float logBase = pow(2.7182818, 2.2);
361
362 // map NaN and inf to 0
363 if ((i & 0x7c00) == 0x7c00) {
364 toLinear[i] = 0;
365 continue;
366 }
367
368 //
369 // _toLinear - assume i is NATIVE, but our output needs
370 // to get flipped to XDR
371 //
372 h.setBits(i);
373 sign = 1;
374 if ((float)h < 0) {
375 sign = -1;
376 }
377
378 if ( fabs( (float)h) <= 1.0 ) {
379 h = (half)(sign * pow((float)fabs((float)h), 2.2f));
380 } else {
381 h = (half)(sign * pow(logBase, (float)(fabs((float)h) - 1.0)));
382 }
383
384 {
385 char *tmp = (char *)(&toLinear[i]);
386
387 Xdr::write <CharPtrIO> ( tmp, h.bits());
388 }
389 }
390 #ifndef OPENEXR_BUILTIN_TABLES
391 printf("const unsigned short dwaCompressorToLinear[] = \n");
392 printf("{");
393 for (int i=0; i<65536; ++i) {
394 if (i % 8 == 0) {
395 printf("\n ");
396 }
397 printf("0x%04x, ", toLinear[i]);
398 }
399 printf("\n};\n");
400 #endif // OPENEXR_BUILTIN_TABLES
401 }
402
403
404 static void
generateToNonlinear()405 generateToNonlinear()
406 {
407 #ifndef OPENEXR_BUILTIN_TABLES
408 unsigned short toNonlinear[65536];
409 #else
410 unsigned short* toNonlinear = dwaCompressorToNonlinear;
411 #endif // OPENEXR_BUILTIN_TABLES
412
413 toNonlinear[0] = 0;
414
415 for (int i=1; i<65536; ++i) {
416 unsigned short usNative, usXdr;
417 half h;
418 float sign = 1;
419 float logBase = pow(2.7182818, 2.2);
420
421 usXdr = i;
422
423 {
424 const char *tmp = (char *)(&usXdr);
425
426 Xdr::read<CharPtrIO>(tmp, usNative);
427 }
428
429 // map NaN and inf to 0
430 if ((usNative & 0x7c00) == 0x7c00) {
431 toNonlinear[i] = 0;
432 continue;
433 }
434
435 //
436 // toNonlinear - assume i is XDR
437 //
438 h.setBits(usNative);
439 sign = 1;
440 if ((float)h < 0) {
441 sign = -1;
442 }
443
444 if ( fabs( (float)h ) <= 1.0) {
445 h = (half)(sign * pow(fabs((float)h), 1.f/2.2f));
446 } else {
447 h = (half)(sign * ( log(fabs((float)h)) / log(logBase) + 1.0) );
448 }
449 toNonlinear[i] = h.bits();
450 }
451 #ifndef OPENEXR_BUILTIN_TABLES
452 printf("const unsigned short dwaCompressorToNonlinear[] = \n");
453 printf("{");
454 for (int i=0; i<65536; ++i) {
455 if (i % 8 == 0) {
456 printf("\n ");
457 }
458 printf("0x%04x, ", toNonlinear[i]);
459 }
460 printf("\n};\n");
461 #endif // OPENEXR_BUILTIN_TABLES
462 }
463
464
465 #ifndef OPENEXR_BUILTIN_TABLES
466 //
467 // Attempt to get available CPUs in a somewhat portable way.
468 //
469
470 int
cpuCount()471 cpuCount()
472 {
473 if (!ILMTHREAD_NAMESPACE::supportsThreads()) return 1;
474
475 int cpuCount = 1;
476
477 #if defined (OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN)
478
479 cpuCount = sysconf(_SC_NPROCESSORS_ONLN);
480
481 #elif defined (_WIN32)
482
483 SYSTEM_INFO sysinfo;
484 GetSystemInfo( &sysinfo );
485 cpuCount = sysinfo.dwNumberOfProcessors;
486
487 #endif
488
489 if (cpuCount < 1) cpuCount = 1;
490 return cpuCount;
491 }
492
493 //
494 // Generate acceleration luts for the quantization.
495 //
496 // For each possible input value, we want to find the closest numbers
497 // which have one fewer bits set than before.
498 //
499 // This gives us num_bits(input)-1 values per input. If we alloc
500 // space for everything, that's like a 2MB table. We can do better
501 // by compressing all the values to be contigious and using offset
502 // pointers.
503 //
504 // After we've found the candidates with fewer bits set, sort them
505 // based on increasing numbers of bits set. This way, on quantize(),
506 // we can scan through the list and halt once we find the first
507 // candidate within the error range. For small values that can
508 // be quantized to 0, 0 is the first value tested and the search
509 // can exit fairly quickly.
510 //
511
512 void
generateLutHeader()513 generateLutHeader()
514 {
515 std::vector<LutHeaderWorker*> workers;
516
517 size_t numWorkers = cpuCount();
518 size_t workerInterval = 65536 / numWorkers;
519
520 for (size_t i=0; i<numWorkers; ++i) {
521 if (i != numWorkers-1) {
522 workers.push_back( new LutHeaderWorker( i *workerInterval,
523 (i+1)*workerInterval) );
524 } else {
525 workers.push_back( new LutHeaderWorker(i*workerInterval, 65536) );
526 }
527 }
528
529 if (ILMTHREAD_NAMESPACE::supportsThreads()) {
530 std::vector<LutHeaderWorker::Runner*> runners;
531 for (size_t i=0; i<workers.size(); ++i) {
532 runners.push_back( new LutHeaderWorker::Runner(*workers[i], (i==0)) );
533 }
534
535 for (size_t i=0; i<workers.size(); ++i) {
536 delete runners[i];
537 }
538 } else {
539 for (size_t i=0; i<workers.size(); ++i) {
540 workers[i]->run(i == 0);
541 }
542 }
543
544 printf("static unsigned int closestDataOffset[] = {\n");
545 int offsetIdx = 0;
546 int offsetPrev = 0;
547 for (size_t i=0; i<workers.size(); ++i) {
548 for (size_t value=0; value<workers[i]->numValues(); ++value) {
549 if (offsetIdx % 8 == 0) {
550 printf(" ");
551 }
552 printf("%6lu, ", workers[i]->offset()[value] + offsetPrev);
553 if (offsetIdx % 8 == 7) {
554 printf("\n");
555 }
556 offsetIdx++;
557 }
558 offsetPrev += workers[i]->offset()[workers[i]->numValues()-1] +
559 workers[i]->lastCandidateCount();
560 }
561 printf("};\n\n\n");
562
563
564 printf("static unsigned short closestData[] = {\n");
565 int elementIdx = 0;
566 for (size_t i=0; i<workers.size(); ++i) {
567 for (size_t element=0; element<workers[i]->numElements(); ++element) {
568 if (elementIdx % 8 == 0) {
569 printf(" ");
570 }
571 printf("%5d, ", workers[i]->elements()[element]);
572 if (elementIdx % 8 == 7) {
573 printf("\n");
574 }
575 elementIdx++;
576 }
577 }
578 printf("};\n\n\n");
579
580 for (size_t i=0; i<workers.size(); ++i) {
581 delete workers[i];
582 }
583 }
584
585
586 int
main(int argc,char ** argv)587 main(int argc, char **argv)
588 {
589 printf("#include <cstddef>\n");
590 printf("\n\n\n");
591
592 generateNoop();
593
594 printf("\n\n\n");
595
596 generateToLinear();
597
598 printf("\n\n\n");
599
600 generateToNonlinear();
601
602 printf("\n\n\n");
603
604 generateLutHeader();
605
606 return 0;
607 }
608 #else // OPENEXR_BUILTIN_TABLES
609
610 #include "dwaLookups.h"
611
612 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
613
init_dwa_()614 static void init_dwa_()
615 {
616 generateNoop();
617 generateToLinear();
618 generateToNonlinear();
619 // N/A: generateLutHeader();
620 }
621
init_dwa()622 static inline void init_dwa()
623 {
624 static bool initialized = false;
625 if (!initialized)
626 {
627 init_dwa_();
628 initialized = true;
629 }
630 }
631
get_dwaCompressorNoOp()632 const unsigned short* get_dwaCompressorNoOp()
633 {
634 init_dwa();
635 return dwaCompressorNoOp;
636 }
get_dwaCompressorToLinear()637 const unsigned short* get_dwaCompressorToLinear()
638 {
639 init_dwa();
640 return dwaCompressorToLinear;
641 }
get_dwaCompressorToNonlinear()642 const unsigned short* get_dwaCompressorToNonlinear()
643 {
644 init_dwa();
645 return dwaCompressorToNonlinear;
646 }
647
648 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
649
650 #endif // OPENEXR_BUILTIN_TABLES
651