1 ///////////////////////////////////////////////////////////////////////////
2 //
3 // Copyright (c) 2009-2014 DreamWorks Animation LLC.
4 //
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 // *       Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // *       Redistributions in binary form must reproduce the above
13 // copyright notice, this list of conditions and the following disclaimer
14 // in the documentation and/or other materials provided with the
15 // distribution.
16 // *       Neither the name of DreamWorks Animation nor the names of
17 // its contributors may be used to endorse or promote products derived
18 // from this software without specific prior written permission.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //
32 ///////////////////////////////////////////////////////////////////////////
33 
34 #define OPENEXR_BUILTIN_TABLES
35 
36 //
37 // A program to generate various acceleration lookup tables
38 // for Imf::DwaCompressor
39 //
40 
41 #include <cstddef>
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <math.h>
45 #include <vector>
46 
47 #include <OpenEXRConfig.h>
48 
49 #ifndef OPENEXR_BUILTIN_TABLES
50 #ifdef OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN
51 #include <unistd.h>
52 #endif
53 #endif // OPENEXR_BUILTIN_TABLES
54 
55 #include <half.h>
56 #include <IlmThread.h>
57 #include <IlmThreadSemaphore.h>
58 #include <ImfIO.h>
59 #include <ImfXdr.h>
60 #include "ImfNamespace.h"
61 
62 using namespace OPENEXR_IMF_NAMESPACE;
63 
64 namespace {
65 
66 #ifdef OPENEXR_BUILTIN_TABLES
67 static unsigned short dwaCompressorNoOp[0x10000] = {};
68 static unsigned short dwaCompressorToLinear[0x10000] = {};
69 static unsigned short dwaCompressorToNonlinear[0x10000] = {};
70 
71 //static unsigned int closestDataOffset[0x10000] = {};
72 //static unsigned short closestData[0x80000] = {};
73 #else
74 
75     class LutHeaderWorker
76     {
77         public:
78             class Runner : public ILMTHREAD_NAMESPACE::Thread
79             {
80                 public:
81                     Runner(LutHeaderWorker &worker, bool output):
82                         ILMTHREAD_NAMESPACE::Thread(),
83                         _worker(worker),
84                         _output(output)
85                     {
86                         start();
87                     }
88 
89                     virtual ~Runner()
90                     {
91                         _semaphore.wait();
92                     }
93 
94                     virtual void run()
95                     {
96                         _semaphore.post();
97                         _worker.run(_output);
98                     }
99 
100                 private:
101                     LutHeaderWorker     &_worker;
102                     bool                 _output;
103                     ILMTHREAD_NAMESPACE::Semaphore _semaphore;
104 
105             }; // class LutHeaderWorker::Runner
106 
107 
108             LutHeaderWorker(size_t startValue,
109                             size_t endValue):
110                 _lastCandidateCount(0),
111                 _startValue(startValue),
112                 _endValue(endValue),
113                 _numElements(0),
114                 _offset(new size_t[numValues()]),
115                 _elements(new unsigned short[1024*1024*2])
116             {
117             }
118 
119             ~LutHeaderWorker()
120             {
121                 delete[] _offset;
122                 delete[] _elements;
123             }
124 
125             size_t lastCandidateCount() const
126             {
127                 return _lastCandidateCount;
128             }
129 
130             size_t numValues() const
131             {
132                 return _endValue - _startValue;
133             }
134 
135             size_t numElements() const
136             {
137                 return _numElements;
138             }
139 
140             const size_t* offset() const
141             {
142                 return _offset;
143             }
144 
145             const unsigned short* elements() const
146             {
147                 return _elements;
148             }
149 
150             void run(bool outputProgress)
151             {
152                 half candidate[16];
153                 int  candidateCount = 0;
154 
155                 for (size_t input=_startValue; input<_endValue; ++input) {
156 
157                     if (outputProgress) {
158 #ifdef __GNUC__
159                         if (input % 100 == 0) {
160                             fprintf(stderr,
161                             " Building acceleration for DwaCompressor, %.2f %%      %c",
162                                           100.*(float)input/(float)numValues(), 13);
163                         }
164 #else
165                         if (input % 1000 == 0) {
166                             fprintf(stderr,
167                             " Building acceleration for DwaCompressor, %.2f %%\n",
168                                           100.*(float)input/(float)numValues());
169                         }
170 #endif
171                     }
172 
173 
174                     int  numSetBits = countSetBits(input);
175                     half inputHalf, closestHalf;
176 
177                     inputHalf.setBits(input);
178 
179                     _offset[input - _startValue] = _numElements;
180 
181                     // Gather candidates
182                     candidateCount = 0;
183                     for (int targetNumSetBits=numSetBits-1; targetNumSetBits>=0;
184                                                            --targetNumSetBits) {
185                         bool valueFound = false;
186 
187                         for (int i=0; i<65536; ++i) {
188                             if (countSetBits(i) != targetNumSetBits) continue;
189 
190                             if (!valueFound) {
191                                 closestHalf.setBits(i);
192                                 valueFound = true;
193                             } else {
194                                 half tmpHalf;
195 
196                                 tmpHalf.setBits(i);
197 
198                                 if (fabs((float)inputHalf - (float)tmpHalf) <
199                                     fabs((float)inputHalf - (float)closestHalf)) {
200                                     closestHalf = tmpHalf;
201                                 }
202                             }
203                         }
204 
205                         if (valueFound == false) {
206                             fprintf(stderr, "bork bork bork!\n");
207                         }
208 
209                         candidate[candidateCount] = closestHalf;
210                         candidateCount++;
211                     }
212 
213                     // Sort candidates by increasing number of bits set
214                     for (int i=0; i<candidateCount; ++i) {
215                         for (int j=i+1; j<candidateCount; ++j) {
216 
217                             int   iCnt = countSetBits(candidate[i].bits());
218                             int   jCnt = countSetBits(candidate[j].bits());
219 
220                             if (jCnt < iCnt) {
221                                 half tmp     = candidate[i];
222                                 candidate[i] = candidate[j];
223                                 candidate[j] = tmp;
224                             }
225                         }
226                     }
227 
228                     // Copy candidates to the data buffer;
229                     for (int i=0; i<candidateCount; ++i) {
230                         _elements[_numElements] = candidate[i].bits();
231                         _numElements++;
232                     }
233 
234                     if (input == _endValue-1) {
235                         _lastCandidateCount = candidateCount;
236                     }
237                 }
238             }
239 
240 
241         private:
242             size_t          _lastCandidateCount;
243             size_t          _startValue;
244             size_t          _endValue;
245             size_t          _numElements;
246             size_t         *_offset;
247             unsigned short *_elements;
248 
249             //
250             // Precomputing the bit count runs faster than using
251             // the builtin instruction, at least in one case..
252             //
253             // Precomputing 8-bits is no slower than 16-bits,
254             // and saves a fair bit of overhead..
255             //
256             int countSetBits(unsigned short src)
257             {
258                 static const unsigned short numBitsSet[256] =
259                 {
260                     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
261                     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
262                     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
263                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
264                     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
265                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
266                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
267                     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
268                     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
269                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
270                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
271                     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
272                     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
273                     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
274                     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
275                     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
276                 };
277 
278                 return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
279             }
280 
281     }; // class LutHeaderWorker
282 
283 #endif // OPENEXR_BUILTIN_TABLES
284 
285 } // namespace
286 
287 
288 //
289 // Generate a no-op LUT, to cut down in conditional branches
290 //
291 static void
generateNoop()292 generateNoop()
293 {
294 #ifndef OPENEXR_BUILTIN_TABLES
295     printf("const unsigned short dwaCompressorNoOp[] = \n");
296     printf("{");
297 #endif // OPENEXR_BUILTIN_TABLES
298     for (int i=0; i<65536; ++i) {
299 #ifndef OPENEXR_BUILTIN_TABLES
300         if (i % 8 == 0) {
301             printf("\n    ");
302         }
303 #endif  // OPENEXR_BUILTIN_TABLES
304         unsigned short dst;
305         char *tmp = (char *)(&dst);
306 
307         unsigned short src = (unsigned short)i;
308         Xdr::write <CharPtrIO> (tmp,  src);
309 #ifndef OPENEXR_BUILTIN_TABLES
310         printf("0x%04x, ", dst);
311 #else
312         dwaCompressorNoOp[i] = dst;
313 #endif // OPENEXR_BUILTIN_TABLES
314     }
315 #ifndef OPENEXR_BUILTIN_TABLES
316     printf("\n};\n");
317 #endif // OPENEXR_BUILTIN_TABLES
318 }
319 
320 //
321 // Nonlinearly encode luminance. For values below 1.0, we want
322 // to use a gamma 2.2 function to match what is fairly common
323 // for storing output referred. However, > 1, gamma functions blow up,
324 // and log functions are much better behaved. We could use a log
325 // function everywhere, but it tends to over-sample dark
326 // regions and undersample the brighter regions, when
327 // compared to the way real devices reproduce values.
328 //
329 // So, above 1, use a log function which is a smooth blend
330 // into the gamma function.
331 //
332 //  Nonlinear(linear) =
333 //
334 //    linear^(1./2.2)             / linear <= 1.0
335 //                               |
336 //    ln(linear)/ln(e^2.2) + 1    \ otherwise
337 //
338 //
339 // toNonlinear[] needs to take in XDR format half float values,
340 // and output NATIVE format float.
341 //
342 // toLinear[] does the opposite - takes in NATIVE half and
343 // outputs XDR half values.
344 //
345 
346 static void
generateToLinear()347 generateToLinear()
348 {
349 #ifndef OPENEXR_BUILTIN_TABLES
350     unsigned short toLinear[65536];
351 #else
352     unsigned short* toLinear = dwaCompressorToLinear;
353 #endif // OPENEXR_BUILTIN_TABLES
354 
355     toLinear[0] = 0;
356 
357     for (int i=1; i<65536; ++i) {
358         half  h;
359         float sign    = 1;
360         float logBase = pow(2.7182818, 2.2);
361 
362         // map  NaN and inf to 0
363         if ((i & 0x7c00) == 0x7c00) {
364             toLinear[i]    = 0;
365             continue;
366         }
367 
368         //
369         // _toLinear - assume i is NATIVE, but our output needs
370         //             to get flipped to XDR
371         //
372         h.setBits(i);
373         sign = 1;
374         if ((float)h < 0) {
375             sign = -1;
376         }
377 
378         if ( fabs( (float)h) <= 1.0 ) {
379             h  = (half)(sign * pow((float)fabs((float)h), 2.2f));
380         } else {
381             h  = (half)(sign * pow(logBase, (float)(fabs((float)h) - 1.0)));
382         }
383 
384         {
385             char *tmp = (char *)(&toLinear[i]);
386 
387             Xdr::write <CharPtrIO> ( tmp,  h.bits());
388         }
389     }
390 #ifndef OPENEXR_BUILTIN_TABLES
391     printf("const unsigned short dwaCompressorToLinear[] = \n");
392     printf("{");
393     for (int i=0; i<65536; ++i) {
394         if (i % 8 == 0) {
395             printf("\n    ");
396         }
397         printf("0x%04x, ", toLinear[i]);
398     }
399     printf("\n};\n");
400 #endif // OPENEXR_BUILTIN_TABLES
401 }
402 
403 
404 static void
generateToNonlinear()405 generateToNonlinear()
406 {
407 #ifndef OPENEXR_BUILTIN_TABLES
408     unsigned short toNonlinear[65536];
409 #else
410     unsigned short* toNonlinear = dwaCompressorToNonlinear;
411 #endif // OPENEXR_BUILTIN_TABLES
412 
413     toNonlinear[0] = 0;
414 
415     for (int i=1; i<65536; ++i) {
416         unsigned short usNative, usXdr;
417         half  h;
418         float sign    = 1;
419         float logBase = pow(2.7182818, 2.2);
420 
421         usXdr           = i;
422 
423         {
424             const char *tmp = (char *)(&usXdr);
425 
426             Xdr::read<CharPtrIO>(tmp, usNative);
427         }
428 
429         // map  NaN and inf to 0
430         if ((usNative & 0x7c00) == 0x7c00) {
431             toNonlinear[i] = 0;
432             continue;
433         }
434 
435         //
436         // toNonlinear - assume i is XDR
437         //
438         h.setBits(usNative);
439         sign = 1;
440         if ((float)h < 0) {
441             sign = -1;
442         }
443 
444         if ( fabs( (float)h ) <= 1.0) {
445             h = (half)(sign * pow(fabs((float)h), 1.f/2.2f));
446         } else {
447             h = (half)(sign * ( log(fabs((float)h)) / log(logBase) + 1.0) );
448         }
449         toNonlinear[i] = h.bits();
450     }
451 #ifndef OPENEXR_BUILTIN_TABLES
452     printf("const unsigned short dwaCompressorToNonlinear[] = \n");
453     printf("{");
454     for (int i=0; i<65536; ++i) {
455         if (i % 8 == 0) {
456             printf("\n    ");
457         }
458         printf("0x%04x, ", toNonlinear[i]);
459     }
460     printf("\n};\n");
461 #endif // OPENEXR_BUILTIN_TABLES
462 }
463 
464 
465 #ifndef OPENEXR_BUILTIN_TABLES
466 //
467 // Attempt to get available CPUs in a somewhat portable way.
468 //
469 
470 int
cpuCount()471 cpuCount()
472 {
473     if (!ILMTHREAD_NAMESPACE::supportsThreads()) return 1;
474 
475     int cpuCount = 1;
476 
477 #if defined (OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN)
478 
479     cpuCount = sysconf(_SC_NPROCESSORS_ONLN);
480 
481 #elif defined (_WIN32)
482 
483     SYSTEM_INFO sysinfo;
484     GetSystemInfo( &sysinfo );
485     cpuCount = sysinfo.dwNumberOfProcessors;
486 
487 #endif
488 
489     if (cpuCount < 1) cpuCount = 1;
490     return cpuCount;
491 }
492 
493 //
494 // Generate acceleration luts for the quantization.
495 //
496 // For each possible input value, we want to find the closest numbers
497 // which have one fewer bits set than before.
498 //
499 // This gives us num_bits(input)-1 values per input. If we alloc
500 // space for everything, that's like a 2MB table. We can do better
501 // by compressing all the values to be contigious and using offset
502 // pointers.
503 //
504 // After we've found the candidates with fewer bits set, sort them
505 // based on increasing numbers of bits set. This way, on quantize(),
506 // we can scan through the list and halt once we find the first
507 // candidate within the error range. For small values that can
508 // be quantized to 0, 0 is the first value tested and the search
509 // can exit fairly quickly.
510 //
511 
512 void
generateLutHeader()513 generateLutHeader()
514 {
515     std::vector<LutHeaderWorker*> workers;
516 
517     size_t numWorkers     = cpuCount();
518     size_t workerInterval = 65536 / numWorkers;
519 
520     for (size_t i=0; i<numWorkers; ++i) {
521         if (i != numWorkers-1) {
522             workers.push_back( new LutHeaderWorker( i   *workerInterval,
523                                                    (i+1)*workerInterval) );
524         } else {
525             workers.push_back( new LutHeaderWorker(i*workerInterval, 65536) );
526         }
527     }
528 
529     if (ILMTHREAD_NAMESPACE::supportsThreads()) {
530         std::vector<LutHeaderWorker::Runner*> runners;
531         for (size_t i=0; i<workers.size(); ++i) {
532             runners.push_back( new LutHeaderWorker::Runner(*workers[i], (i==0)) );
533         }
534 
535         for (size_t i=0; i<workers.size(); ++i) {
536             delete runners[i];
537         }
538     } else {
539         for (size_t i=0; i<workers.size(); ++i) {
540             workers[i]->run(i == 0);
541         }
542     }
543 
544     printf("static unsigned int closestDataOffset[] = {\n");
545     int offsetIdx  = 0;
546     int offsetPrev = 0;
547     for (size_t i=0; i<workers.size(); ++i) {
548         for (size_t value=0; value<workers[i]->numValues(); ++value) {
549             if (offsetIdx % 8 == 0) {
550                 printf("    ");
551             }
552             printf("%6lu, ", workers[i]->offset()[value] + offsetPrev);
553             if (offsetIdx % 8 == 7) {
554                 printf("\n");
555             }
556             offsetIdx++;
557         }
558         offsetPrev += workers[i]->offset()[workers[i]->numValues()-1] +
559                       workers[i]->lastCandidateCount();
560     }
561     printf("};\n\n\n");
562 
563 
564     printf("static unsigned short closestData[] = {\n");
565     int elementIdx = 0;
566     for (size_t i=0; i<workers.size(); ++i) {
567         for (size_t element=0; element<workers[i]->numElements(); ++element) {
568             if (elementIdx % 8 == 0) {
569                 printf("    ");
570             }
571             printf("%5d, ", workers[i]->elements()[element]);
572             if (elementIdx % 8 == 7) {
573                 printf("\n");
574             }
575             elementIdx++;
576         }
577     }
578     printf("};\n\n\n");
579 
580     for (size_t i=0; i<workers.size(); ++i) {
581         delete workers[i];
582     }
583 }
584 
585 
586 int
main(int argc,char ** argv)587 main(int argc, char **argv)
588 {
589     printf("#include <cstddef>\n");
590     printf("\n\n\n");
591 
592     generateNoop();
593 
594     printf("\n\n\n");
595 
596     generateToLinear();
597 
598     printf("\n\n\n");
599 
600     generateToNonlinear();
601 
602     printf("\n\n\n");
603 
604     generateLutHeader();
605 
606     return 0;
607 }
608 #else // OPENEXR_BUILTIN_TABLES
609 
610 #include "dwaLookups.h"
611 
612 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
613 
init_dwa_()614 static void init_dwa_()
615 {
616     generateNoop();
617     generateToLinear();
618     generateToNonlinear();
619     // N/A: generateLutHeader();
620 }
621 
init_dwa()622 static inline void init_dwa()
623 {
624     static bool initialized = false;
625     if (!initialized)
626     {
627         init_dwa_();
628         initialized = true;
629     }
630 }
631 
get_dwaCompressorNoOp()632 const unsigned short* get_dwaCompressorNoOp()
633 {
634     init_dwa();
635     return dwaCompressorNoOp;
636 }
get_dwaCompressorToLinear()637 const unsigned short* get_dwaCompressorToLinear()
638 {
639     init_dwa();
640     return dwaCompressorToLinear;
641 }
get_dwaCompressorToNonlinear()642 const unsigned short* get_dwaCompressorToNonlinear()
643 {
644     init_dwa();
645     return dwaCompressorToNonlinear;
646 }
647 
648 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
649 
650 #endif // OPENEXR_BUILTIN_TABLES
651