1 //
2 // SPDX-License-Identifier: BSD-3-Clause
3 // Copyright (c) Contributors to the OpenEXR Project.
4 //
5 
6 #include <cstddef>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h>
10 #include <vector>
11 #include <assert.h>
12 
13 #include <OpenEXRConfig.h>
14 #include <OpenEXRConfigInternal.h>
15 
16 #if __cplusplus >= 201103L
17 #include <thread>
18 #endif
19 
20 #ifdef OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN
21 #include <unistd.h>
22 #endif
23 
24 #include <half.h>
25 #include <IlmThread.h>
26 #include <IlmThreadSemaphore.h>
27 #include <ImfIO.h>
28 #include <ImfXdr.h>
29 #include "dwaLookups.h"
30 #include "ImfNamespace.h"
31 
32 //
33 // This test uses the code that generates the dwaLookups.h header to
34 // validate the the values in the tables are correct.
35 //
36 
37 using namespace OPENEXR_IMF_NAMESPACE;
38 using namespace OPENEXR_IMF_NAMESPACE;
39 
40 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_ENTER
41 
42 #include "dwaLookups.h"
43 
44 OPENEXR_IMF_INTERNAL_NAMESPACE_SOURCE_EXIT
45 
46 namespace {
47 
48 class LutHeaderWorker
49 {
50 public:
51     class Runner : public ILMTHREAD_NAMESPACE::Thread
52     {
53     public:
Runner(LutHeaderWorker & worker,bool output)54         Runner(LutHeaderWorker &worker, bool output):
55             ILMTHREAD_NAMESPACE::Thread(),
56             _worker(worker),
57             _output(output)
58         {
59             start();
60         }
61 
~Runner()62         virtual ~Runner()
63         {
64             _semaphore.wait();
65         }
66 
run()67         virtual void run()
68         {
69             _semaphore.post();
70             _worker.run(_output);
71         }
72 
73     private:
74         LutHeaderWorker     &_worker;
75         bool                 _output;
76         ILMTHREAD_NAMESPACE::Semaphore _semaphore;
77 
78     }; // class LutHeaderWorker::Runner
79 
80 
LutHeaderWorker(size_t startValue,size_t endValue)81     LutHeaderWorker(size_t startValue,
82                     size_t endValue):
83         _lastCandidateCount(0),
84         _startValue(startValue),
85         _endValue(endValue),
86         _numElements(0),
87         _offset(new size_t[numValues()]),
88         _elements(new unsigned short[1024*1024*2])
89     {
90     }
91 
~LutHeaderWorker()92     ~LutHeaderWorker()
93     {
94         delete[] _offset;
95         delete[] _elements;
96     }
97 
lastCandidateCount() const98     size_t lastCandidateCount() const
99     {
100         return _lastCandidateCount;
101     }
102 
numValues() const103     size_t numValues() const
104     {
105         return _endValue - _startValue;
106     }
107 
numElements() const108     size_t numElements() const
109     {
110         return _numElements;
111     }
112 
offset() const113     const size_t* offset() const
114     {
115         return _offset;
116     }
117 
elements() const118     const unsigned short* elements() const
119     {
120         return _elements;
121     }
122 
run(bool outputProgress)123     void run(bool outputProgress)
124     {
125         half candidate[16];
126         int  candidateCount = 0;
127 
128         for (size_t input=_startValue; input<_endValue; ++input) {
129 
130             int  numSetBits = countSetBits(input);
131             half inputHalf, closestHalf;
132 
133             inputHalf.setBits(input);
134             closestHalf.setBits(0);
135 
136             _offset[input - _startValue] = _numElements;
137 
138             // Gather candidates
139             candidateCount = 0;
140             for (int targetNumSetBits=numSetBits-1; targetNumSetBits>=0;
141                  --targetNumSetBits) {
142                 bool valueFound = false;
143 
144                 for (int i=0; i<65536; ++i) {
145                     if (countSetBits(i) != targetNumSetBits) continue;
146 
147                     if (!valueFound) {
148                         closestHalf.setBits(i);
149                         valueFound = true;
150                     } else {
151                         half tmpHalf;
152 
153                         tmpHalf.setBits(i);
154 
155                         if (fabs((float)inputHalf - (float)tmpHalf) <
156                             fabs((float)inputHalf - (float)closestHalf)) {
157                             closestHalf = tmpHalf;
158                         }
159                     }
160                 }
161 
162                 candidate[candidateCount] = closestHalf;
163                 candidateCount++;
164             }
165 
166             // Sort candidates by increasing number of bits set
167             for (int i=0; i<candidateCount; ++i) {
168                 for (int j=i+1; j<candidateCount; ++j) {
169 
170                     int   iCnt = countSetBits(candidate[i].bits());
171                     int   jCnt = countSetBits(candidate[j].bits());
172 
173                     if (jCnt < iCnt) {
174                         half tmp     = candidate[i];
175                         candidate[i] = candidate[j];
176                         candidate[j] = tmp;
177                     }
178                 }
179             }
180 
181             // Copy candidates to the data buffer;
182             for (int i=0; i<candidateCount; ++i) {
183                 _elements[_numElements] = candidate[i].bits();
184                 _numElements++;
185             }
186 
187             if (input == _endValue-1) {
188                 _lastCandidateCount = candidateCount;
189             }
190         }
191     }
192 
193 private:
194 
195     size_t          _lastCandidateCount;
196     size_t          _startValue;
197     size_t          _endValue;
198     size_t          _numElements;
199     size_t         *_offset;
200     unsigned short *_elements;
201 
202     //
203     // Precomputing the bit count runs faster than using
204     // the builtin instruction, at least in one case..
205     //
206     // Precomputing 8-bits is no slower than 16-bits,
207     // and saves a fair bit of overhead..
208     //
countSetBits(unsigned short src)209     int countSetBits(unsigned short src)
210     {
211         static const unsigned short numBitsSet[256] =
212             {
213                 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
214                 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
215                 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
216                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
217                 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
218                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
219                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
220                 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
221                 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
222                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
223                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
224                 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
225                 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
226                 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
227                 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
228                 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
229             };
230 
231         return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
232     }
233 
234 }; // class LutHeaderWorker
235 
236 } // namespace
237 
238 
239 //
240 // Generate a no-op LUT, to cut down in conditional branches
241 //
242 void
testNoop()243 testNoop()
244 {
245     printf("test dwaCompressorNoOp[] \n");
246 
247     for (int i=0; i<65536; ++i)
248     {
249         unsigned short src = (unsigned short)i;
250         assert (src == OPENEXR_IMF_INTERNAL_NAMESPACE::dwaCompressorNoOp[i]);
251     }
252 }
253 
254 //
255 // Nonlinearly encode luminance. For values below 1.0, we want
256 // to use a gamma 2.2 function to match what is fairly common
257 // for storing output referred. However, > 1, gamma functions blow up,
258 // and log functions are much better behaved. We could use a log
259 // function everywhere, but it tends to over-sample dark
260 // regions and undersample the brighter regions, when
261 // compared to the way real devices reproduce values.
262 //
263 // So, above 1, use a log function which is a smooth blend
264 // into the gamma function.
265 //
266 //  Nonlinear(linear) =
267 //
268 //    linear^(1./2.2)             / linear <= 1.0
269 //                               |
270 //    ln(linear)/ln(e^2.2) + 1    \ otherwise
271 //
272 //
273 // toNonlinear[] needs to take in XDR format half float values,
274 // and output NATIVE format float.
275 //
276 // toLinear[] does the opposite - takes in NATIVE half and
277 // outputs XDR half values.
278 //
279 
280 void
testToLinear()281 testToLinear()
282 {
283     unsigned short toLinear[65536];
284 
285     toLinear[0] = 0;
286 
287     for (int i=1; i<65536; ++i) {
288         half  h;
289         float sign    = 1;
290         float logBase = pow(2.7182818, 2.2);
291 
292         // map  NaN and inf to 0
293         if ((i & 0x7c00) == 0x7c00) {
294             toLinear[i]    = 0;
295             continue;
296         }
297 
298         //
299         // _toLinear - assume i is NATIVE, but our output needs
300         //             to get flipped to XDR
301         //
302         h.setBits(i);
303         sign = 1;
304         if ((float)h < 0) {
305             sign = -1;
306         }
307 
308         if ( fabs( (float)h) <= 1.0 ) {
309             h  = (half)(sign * pow((float)fabs((float)h), 2.2f));
310         } else {
311             h  = (half)(sign * pow(logBase, (float)(fabs((float)h) - 1.0)));
312         }
313 
314         {
315             char *tmp = (char *)(&toLinear[i]);
316 
317             Xdr::write <CharPtrIO> ( tmp,  h.bits());
318         }
319     }
320 
321     printf("test dwaCompressorToLinear[]\n");
322     for (int i=0; i<65536; ++i)
323         assert (toLinear[i] == OPENEXR_IMF_INTERNAL_NAMESPACE::dwaCompressorToLinear[i]);
324 }
325 
326 
327 void
testToNonlinear()328 testToNonlinear()
329 {
330     unsigned short toNonlinear[65536];
331 
332     toNonlinear[0] = 0;
333 
334     for (int i=1; i<65536; ++i) {
335         unsigned short usNative, usXdr;
336         half  h;
337         float sign    = 1;
338         float logBase = pow(2.7182818, 2.2);
339 
340         usXdr           = i;
341 
342         {
343             const char *tmp = (char *)(&usXdr);
344 
345             Xdr::read<CharPtrIO>(tmp, usNative);
346         }
347 
348         // map  NaN and inf to 0
349         if ((usNative & 0x7c00) == 0x7c00) {
350             toNonlinear[i] = 0;
351             continue;
352         }
353 
354         //
355         // toNonlinear - assume i is XDR
356         //
357         h.setBits(usNative);
358         sign = 1;
359         if ((float)h < 0) {
360             sign = -1;
361         }
362 
363         if ( fabs( (float)h ) <= 1.0) {
364             h = (half)(sign * pow(fabs((float)h), 1.f/2.2f));
365         } else {
366             h = (half)(sign * ( log(fabs((float)h)) / log(logBase) + 1.0) );
367         }
368         toNonlinear[i] = h.bits();
369     }
370 
371     printf("test dwaCompressorToNonlinear[]\n");
372     for (int i=0; i<65536; ++i)
373         assert (toNonlinear[i] == OPENEXR_IMF_INTERNAL_NAMESPACE::dwaCompressorToNonlinear[i]);
374 }
375 
376 //
377 // Attempt to get available CPUs in a somewhat portable way.
378 //
379 
380 int
cpuCount()381 cpuCount()
382 {
383     if (!ILMTHREAD_NAMESPACE::supportsThreads()) return 1;
384 
385     int cpuCount = 1;
386 
387 #if __cplusplus >= 201103L
388     cpuCount = std::thread::hardware_concurrency();
389 #else
390 
391 #if defined (OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN)
392 
393     cpuCount = sysconf(_SC_NPROCESSORS_ONLN);
394 
395 #elif defined (_WIN32)
396 
397     SYSTEM_INFO sysinfo;
398     GetSystemInfo( &sysinfo );
399     cpuCount = sysinfo.dwNumberOfProcessors;
400 
401 #endif
402 
403 #endif
404     if (cpuCount < 1) cpuCount = 1;
405     return cpuCount;
406 }
407 
408 //
409 // Generate acceleration luts for the quantization.
410 //
411 // For each possible input value, we want to find the closest numbers
412 // which have one fewer bits set than before.
413 //
414 // This gives us num_bits(input)-1 values per input. If we alloc
415 // space for everything, that's like a 2MB table. We can do better
416 // by compressing all the values to be contigious and using offset
417 // pointers.
418 //
419 // After we've found the candidates with fewer bits set, sort them
420 // based on increasing numbers of bits set. This way, on quantize(),
421 // we can scan through the list and halt once we find the first
422 // candidate within the error range. For small values that can
423 // be quantized to 0, 0 is the first value tested and the search
424 // can exit fairly quickly.
425 //
426 
427 void
testLutHeader()428 testLutHeader()
429 {
430     std::vector<LutHeaderWorker*> workers;
431 
432     size_t numWorkers     = cpuCount();
433     size_t workerInterval = 65536 / numWorkers;
434 
435     for (size_t i=0; i<numWorkers; ++i) {
436         if (i != numWorkers-1) {
437             workers.push_back( new LutHeaderWorker( i   *workerInterval,
438                                                    (i+1)*workerInterval) );
439         } else {
440             workers.push_back( new LutHeaderWorker(i*workerInterval, 65536) );
441         }
442     }
443 
444     if (ILMTHREAD_NAMESPACE::supportsThreads()) {
445         std::vector<LutHeaderWorker::Runner*> runners;
446         for (size_t i=0; i<workers.size(); ++i) {
447             runners.push_back( new LutHeaderWorker::Runner(*workers[i], (i==0)) );
448         }
449 
450         for (size_t i=0; i<workers.size(); ++i) {
451             delete runners[i];
452         }
453     } else {
454         for (size_t i=0; i<workers.size(); ++i) {
455             workers[i]->run(i == 0);
456         }
457     }
458 
459     printf("test closestDataOffset[]\n");
460     int offsetIdx  = 0;
461     int offsetPrev = 0;
462     for (size_t i=0; i<workers.size(); ++i) {
463         for (size_t value=0; value<workers[i]->numValues(); ++value)
464         {
465             assert (OPENEXR_IMF_INTERNAL_NAMESPACE::closestDataOffset[offsetIdx] == workers[i]->offset()[value] + offsetPrev);
466             offsetIdx++;
467         }
468         offsetPrev += workers[i]->offset()[workers[i]->numValues()-1] +
469                       workers[i]->lastCandidateCount();
470     }
471 
472     printf("test closestData[]\n");
473     int elementIdx = 0;
474     for (size_t i=0; i<workers.size(); ++i) {
475         for (size_t element=0; element<workers[i]->numElements(); ++element)
476         {
477             assert (OPENEXR_IMF_INTERNAL_NAMESPACE::closestData[elementIdx] == workers[i]->elements()[element]);
478             elementIdx++;
479         }
480     }
481 
482     for (size_t i=0; i<workers.size(); ++i) {
483         delete workers[i];
484     }
485 }
486 
487 void
testDwaLookups(const std::string &)488 testDwaLookups(const std::string&)
489 {
490     testNoop();
491     testToLinear();
492     testToNonlinear();
493     testLutHeader();
494 }
495