/* quantization.cpp - source file for class with nonuniform quantization functionality
 * written by C. R. Helmrich, last modified in 2020 - see License.htm for legal notices
 *
 * The copyright in this software is being made available under the exhale Copyright License
 * and comes with ABSOLUTELY NO WARRANTY. This software may be subject to other third-
 * party rights, including patent rights. No such rights are granted under this License.
 *
 * Copyright (c) 2018-2021 Christian R. Helmrich, project ecodis. All rights reserved.
 */

#include "exhaleLibPch.h"
#include "quantization.h"

#define EC_TRAIN (0 && EC_TRELLIS_OPT_CODING) // for RDOC testing

// static helper functions
static inline short getBitCount (EntropyCoder& entrCoder, const int sfIndex, const int sfIndexPred,
                                 const uint8_t groupLength, const uint8_t* coeffQuant,
                                 const uint16_t coeffOffset, const uint16_t numCoeffs)
{
  unsigned bitCount = (sfIndex != UCHAR_MAX && sfIndexPred == UCHAR_MAX ? 8 : entrCoder.indexGetBitCount (sfIndex - sfIndexPred));

  if (groupLength == 1) // include arithmetic coding in bit count
  {
#if EC_TRELLIS_OPT_CODING
    const unsigned bitsStart = (entrCoder.arithGetCtxState () >> 17) & 31;
#endif
    bitCount += entrCoder.arithCodeSigMagn (coeffQuant, coeffOffset, numCoeffs);
#if EC_TRELLIS_OPT_CODING
    bitCount += (entrCoder.arithGetCtxState () >> 17) & 31;
    bitCount -= __min (bitsStart, bitCount); // +new-old m_acBits
#endif
  }

  return (short) __min (SHRT_MAX, bitCount); // exclude sign bits
}
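
// Note on the "(ctxState >> 17) & 31" expressions above: the packed context-state word is
// read as if a small running bit counter (m_acBits) occupied a 5-bit field starting at bit
// 17, so subtracting the old field value from the new one yields the arithmetic-coding bits
// spent on this band. The following is an illustrative, standalone sketch of such 5-bit
// field packing; it is NOT the EntropyCoder's actual state layout and is not compiled:
#if 0
#include <cstdint>

static inline uint32_t packAcBits (const uint32_t state, const uint32_t acBits)
{
  return (state & ~(31u << 17)) | ((acBits & 31u) << 17); // overwrite bits 17..21
}

static inline uint32_t readAcBits (const uint32_t state)
{
  return (state >> 17) & 31u; // extract bits 17..21
}

// usage: bitsSpent = readAcBits (newState) - __min (readAcBits (oldState), readAcBits (newState))
#endif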

#if EC_TRELLIS_OPT_CODING && !EC_TRAIN
static inline double getLagrangeValue (const uint16_t rateIndex) // RD optimization constant
{
  return (95.0 + rateIndex * rateIndex) * 0.0009765625; // / 1024
}
#endif
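
// getLagrangeValue () supplies the empirical Lagrange multiplier lambda = (95 + r*r) / 1024
// used below to weigh rate against distortion via the cost J = D + lambda * R. A minimal
// standalone sketch of that trade-off with made-up candidate values (not compiled):
#if 0
#include <cstdio>

static void rdCostDemo ()
{
  const double lambda = (95.0 + 3 * 3) / 1024.0; // i.e. getLagrangeValue (3)
  const double distA = 0.50; const unsigned rateA = 40; // candidate A: fine quantization
  const double distB = 0.85; const unsigned rateB = 24; // candidate B: coarse quantization

  const double costA = distA + lambda * rateA;
  const double costB = distB + lambda * rateB;

  printf ("lambda = %f, pick candidate %c\n", lambda, (costA < costB ? 'A' : 'B'));
}
#endif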

// private helper functions
double SfbQuantizer::getQuantDist (const unsigned* const coeffMagn, const uint8_t scaleFactor,
                                   const uint8_t* const coeffQuant, const uint16_t numCoeffs)
{
  const double stepSizeDiv = m_lutSfNorm[scaleFactor];
  double dDist = 0.0;

  for (int i = numCoeffs - 1; i >= 0; i--)
  {
    const double d = m_lutXExp43[coeffQuant[i]] - coeffMagn[i] * stepSizeDiv;

    dDist += d * d;
  }

  // consider quantization step-size in calculation of distortion
  return dDist * m_lut2ExpX4[scaleFactor] * m_lut2ExpX4[scaleFactor];
}
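
// getQuantDist () above evaluates, via the LUTs set up in initQuantMemory (),
//   D(sf) = (2^(sf/4))^2 * sum_i (q_i^(4/3) - |x_i| * 2^(-sf/4))^2,
// i.e. the squared error between dequantized and step-size normalized magnitudes, scaled
// back to the signal domain. Equivalent standalone sketch using pow () directly instead of
// the lookup tables (illustration only, not compiled):
#if 0
#include <cmath>

static double quantDistSketch (const unsigned* coeffMagn, const unsigned char* coeffQuant,
                               const int numCoeffs, const int scaleFactor)
{
  const double stepSize = pow (2.0, scaleFactor / 4.0); // 2^(sf/4), cf. m_lut2ExpX4
  double dist = 0.0;

  for (int i = 0; i < numCoeffs; i++)
  {
    const double d = pow ((double) coeffQuant[i], 4.0 / 3.0) - coeffMagn[i] / stepSize;

    dist += d * d;
  }
  return dist * stepSize * stepSize; // undo the normalization by the step-size
}
#endif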

uint8_t SfbQuantizer::quantizeMagnSfb (const unsigned* const coeffMagn, const uint8_t scaleFactor,
                               /*mod*/uint8_t* const coeffQuant, const uint16_t numCoeffs,
#if EC_TRELLIS_OPT_CODING
                                       EntropyCoder* const arithmCoder, const uint16_t coeffOffset,
#endif
                                       short* const sigMaxQ /*= nullptr*/, short* const sigNumQ /*= nullptr*/)
{
  const double stepSizeDiv = m_lutSfNorm[scaleFactor];
  double  dNum = 0.0, dDen = 0.0;
  short sf, maxQ = 0, numQ = 0;

  for (int i = numCoeffs - 1; i >= 0; i--)
  {
    const double normalizedMagn = (double) coeffMagn[i] * stepSizeDiv;
#if SFB_QUANT_FAST_POW
    short q;

    if (normalizedMagn < 28.5)  // fast approximate pow (d, 0.75)
    {
      // based on code from: N. N. Schraudolph, "A Fast, Compact Approximation of the Expo-
      // nential Function," Neural Comput., vol. 11, pp. 853-862, 1999 and M. Ankerl, 2007,
      // https://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
      // (a standalone sketch of this approximation is given after this function)
      union { double d; int32_t i[2]; } u = { normalizedMagn };

      u.i[1] = int32_t (0.75 * (u.i[1] - 1072632447) + 1072632447.0);
      u.i[0] = 0;
      q = short (u.d + (u.d < 1.0 ? 0.3822484 : 0.2734375));
    }
    else
    {
      q = short (SFB_QUANT_OFFSET + pow (normalizedMagn, 0.75));
    }
#else
    short q = short (SFB_QUANT_OFFSET + pow (normalizedMagn, 0.75));
#endif

    if (q > 0)
    {
      if (q >= SCHAR_MAX)
      {
        if (maxQ < q)
        {
          maxQ = q; // find maximum quantized magnitude in vector
        }
        q = SCHAR_MAX;
      }
      else
      {
        const double diffRoundD = m_lutXExp43[q    ] - normalizedMagn;
        const double diffRoundU = m_lutXExp43[q + 1] - normalizedMagn;

        if (diffRoundU * diffRoundU < diffRoundD * diffRoundD)
        {
          q++; // round-up gives lower distortion than round-down
        }
      }
      if (maxQ < q)
      {
        maxQ = q;
      }
      numQ++;
      dNum += m_lutXExp43[q] * normalizedMagn;
      dDen += m_lutXExp43[q] * m_lutXExp43[q];
    }
#if SFB_QUANT_PERCEPT_OPT
    else // q == 0, assume perceptual transparency for code below
    {
      dNum += normalizedMagn * normalizedMagn;
      dDen += normalizedMagn * normalizedMagn;
    }
#endif
    coeffQuant[i] = (uint8_t) q;
  }

  if (sigMaxQ) *sigMaxQ = maxQ; // max. quantized value magnitude
  if (sigNumQ) *sigNumQ = numQ; // nonzero coeff. count (L0 norm)

  sf = scaleFactor;
  // compute least-squares optimal modifier added to scale factor
  if (dNum > SF_THRESH_POS * dDen) sf++;
  else
  if (dNum < SF_THRESH_NEG * dDen) sf--;

#if EC_TRELLIS_OPT_CODING
  if (arithmCoder && (sf > 0) && (maxQ <= SCHAR_MAX)) // use RDOC
  {
    EntropyCoder& entrCoder = *arithmCoder;
#if EC_TRAIN
    const uint32_t codStart = entrCoder.arithGetCodState ();
    const uint32_t ctxStart = entrCoder.arithGetCtxState ();
    uint32_t bitCount = entrCoder.arithCodeSigMagn (&coeffQuant[-((int) coeffOffset)], coeffOffset, numCoeffs);

    bitCount += (entrCoder.arithGetCtxState () >> 17) & 31; // refinement: +new-old m_acBits
    bitCount -= __min ((ctxStart >> 17) & 31, bitCount);
    bitCount += (uint32_t) numQ;  // add sign bits for completion

    entrCoder.arithSetCodState (codStart);  // back to last state
    entrCoder.arithSetCtxState (ctxStart);
#else
    uint32_t bitCount = (uint32_t) numQ;
#endif
    if ((bitCount = quantizeMagnRDOC (entrCoder, (uint8_t) sf, bitCount, coeffOffset, coeffMagn, numCoeffs, coeffQuant)) > 0)
    {
      numQ = bitCount & SHRT_MAX;

      if ((numQ > 0) && (sf < m_maxSfIndex)) // nonzero-quantized
      {
        const double magnNormDiv = m_lutSfNorm[sf];

        dNum = dDen = 0.0;
        for (int i = numCoeffs - 1; i >= 0; i--)
        {
          const double normalizedMagn = (double) coeffMagn[i] * magnNormDiv;
          const uint8_t q = coeffQuant[i];

          if (q > 0)
          {
            dNum += m_lutXExp43[q] * normalizedMagn;
            dDen += m_lutXExp43[q] * m_lutXExp43[q];
          }
#if SFB_QUANT_PERCEPT_OPT
          else   // assume perceptual transparency for code below
          {
            dNum += normalizedMagn * normalizedMagn;
            dDen += normalizedMagn * normalizedMagn;
          }
#endif
        }

        // re-compute least-squares optimal scale factor modifier
        if (dNum > SF_THRESH_POS * dDen) sf++;
#if !SFB_QUANT_PERCEPT_OPT
        else
        if (dNum < SF_THRESH_NEG * dDen) sf--; // reduces SFB RMS
#endif
      } // if nonzero

      if (sigMaxQ) *sigMaxQ = (numQ > 0 ? maxQ : 0); // a new max
      if (sigNumQ) *sigNumQ = numQ; // a new nonzero coeff. count
    }
  }
#endif // EC_TRELLIS_OPT_CODING

#if SFB_QUANT_PERCEPT_OPT
  if ((numQ > 0) && (sf > 0 && sf <= scaleFactor)) // recover RMS
  {
    const double magnNormDiv = m_lutSfNorm[sf];

    dNum = 0.0;  // dDen has normalized energy after quantization
    for (int i = numCoeffs - 1; i >= 0; i--)
    {
      const double normalizedMagn = (double) coeffMagn[i] * magnNormDiv;

      dNum += normalizedMagn * normalizedMagn;
    }

    if (dNum > SF_THRESH_POS * SF_THRESH_POS * dDen) sf++;
  }
#endif
  return (uint8_t) __max (0, sf); // optimized scale factor index
}
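
// Standalone sketch of the fast pow (x, 0.75) approximation referenced in quantizeMagnSfb ()
// above (Schraudolph-style exponent-bit manipulation as popularized by M. Ankerl). Like the
// i[1] indexing above it assumes a little-endian platform with 64-bit IEEE-754 doubles, and
// it omits exhale's magnitude-dependent rounding offsets. Illustration only, not compiled:
#if 0
#include <cmath>
#include <cstdint>
#include <cstdio>

static double fastPow075 (const double x) // approximates pow (x, 0.75) for small positive x
{
  union { double d; int32_t i[2]; } u = { x };

  u.i[1] = int32_t (0.75 * (u.i[1] - 1072632447) + 1072632447.0); // scale biased exponent bits
  u.i[0] = 0;                                                     // clear low mantissa word

  return u.d;
}

static void fastPowDemo ()
{
  for (double x = 0.5; x < 28.5; x *= 2.0)
  {
    printf ("x = %5.2f  approx = %8.5f  exact = %8.5f\n", x, fastPow075 (x), pow (x, 0.75));
  }
}
#endif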

#if EC_TRELLIS_OPT_CODING
uint32_t SfbQuantizer::quantizeMagnRDOC (EntropyCoder& entropyCoder, const uint8_t optimalSf, const unsigned targetBitCount,
                                         const uint16_t coeffOffset, const unsigned* const coeffMagn, // initial MDCT magnitudes
                                         const uint16_t numCoeffs, uint8_t* const quantCoeffs) // returns updated SFB statistics
{
  // numTuples: num of trellis stages. Based on: A. Aggarwal, S. L. Regunathan, and K. Rose,
  // "Trellis-Based Optimization of MPEG-4 Advanced Audio Coding," in Proc. IEEE Workshop on
  // Speech Coding, pp. 142-144, Sep. 2000. Modified for arithmetic instead of Huffman coder
  // (a generic sketch of the underlying Viterbi search is given after this function)
  const uint32_t  codStart = entropyCoder.arithGetCodState ();
  const uint32_t  ctxStart = entropyCoder.arithGetCtxState (); // before call to getBitCount
  const double stepSizeDiv = m_lutSfNorm[optimalSf];
  const uint16_t numStates = 4; // 4 reduction types: [0, 0], [0, -1], [-1, 0], and [-1, -1]
  const uint16_t numTuples = numCoeffs >> 1;
  uint8_t* const quantRate = &m_coeffTemp[((unsigned) m_maxSize8M1 + 1) << 3];
  uint32_t prevCodState[4] = {0, 0, 0, 0};
  uint32_t prevCtxState[4] = {0, 0, 0, 0};
  double   prevVtrbCost[4] = {0, 0, 0, 0};
  uint32_t tempCodState[4] = {0, 0, 0, 0};
  uint32_t tempCtxState[4] = {0, 0, 0, 0};
  double   tempVtrbCost[4] = {0, 0, 0, 0};
  double   quantDist[32][4];   // TODO: dynamic memory allocation
  uint8_t* const optimalIs = (uint8_t* const) (quantDist[32-1]);
  uint8_t  tempQuant[4], numQ; // for tuple/SFB sign bit counting
  unsigned tempBitCount, tuple, is;
  int ds;
#if EC_TRAIN
  double refSfbDist = 0.0, tempSfbDist = 0.0;
#else
  const double lambda = getLagrangeValue (m_rateIndex);
#endif

  if ((coeffMagn == nullptr) || (quantCoeffs == nullptr) || (optimalSf > m_maxSfIndex) || (numTuples == 0) || (numTuples > 32) ||
      (targetBitCount == 0)  || (targetBitCount > SHRT_MAX))
  {
    return 0; // invalid input error
  }

  // save third-last tuple value, required due to an insufficiency of arithGet/SetCtxState()
  if (coeffOffset > 5) tempQuant[3] = entropyCoder.arithGetTuplePtr ()[(coeffOffset >> 1) - 3];

  for (tuple = 0; tuple < numTuples; tuple++) // tuple-wise non-weighted distortion and rate
  {
    const uint16_t  tupleStart = tuple << 1;
    const uint16_t tupleOffset = coeffOffset + tupleStart;
    const double   normalMagnA = (double) coeffMagn[tupleStart    ] * stepSizeDiv;
    const double   normalMagnB = (double) coeffMagn[tupleStart + 1] * stepSizeDiv;
    uint8_t  coeffQuantA = quantCoeffs[tupleStart];
    uint8_t  coeffQuantB = quantCoeffs[tupleStart + 1];

    for (is = 0; is < numStates; is++)  // populate tuple trellis
    {
      uint8_t* const mag = (is != 0 ? tempQuant : quantCoeffs) - (int) tupleOffset; // see arithCodeSigMagn()
      uint8_t*  currRate = &quantRate[(is + tuple * numStates) * numStates];
      double diffA, diffB;

      if (is != 0) // test reduction of quantized MDCT magnitudes
      {
        const uint8_t redA = is >> 1;
        const uint8_t redB = is &  1;

        if ((redA > 0 && coeffQuantA != 1) || (redB > 0 && coeffQuantB != 1))  // avoid path
        {
          tempCodState[is] = tempCodState[0];
          tempCtxState[is] = tempCtxState[0];
          memset (currRate, UCHAR_MAX, numStates);

          continue;
        }
        tempQuant[0] = (coeffQuantA -= redA);
        tempQuant[1] = (coeffQuantB -= redB);
      }
      diffA = m_lutXExp43[coeffQuantA] - normalMagnA;
      diffB = m_lutXExp43[coeffQuantB] - normalMagnB;
      quantDist[tuple][is] = diffA * diffA + diffB * diffB;

      numQ  = (coeffQuantA > 0 ? 1 : 0) + (coeffQuantB > 0 ? 1 : 0);

      if (tuple == 0) // first tuple, with tupleStart == sfbStart
      {
        entropyCoder.arithSetCodState (codStart); // start of SFB
        entropyCoder.arithSetCtxState (ctxStart);
        tempBitCount = entropyCoder.arithCodeSigMagn (mag, tupleOffset, 2);

        tempBitCount += (entropyCoder.arithGetCtxState () >> 17) & 31;  // +new-old m_acBits
        tempBitCount -= __min ((ctxStart >> 17) & 31, tempBitCount);

        memset (currRate, tempBitCount + numQ, numStates);
      }
      else // tuple > 0, rate depends on decisions for last tuple
      {
        for (ds = numStates - 1; ds >= 0; ds--)
        {
          if (quantRate[(ds + (tuple-1) * numStates) * numStates] >= UCHAR_MAX)// avoid path
          {
            currRate[ds] = UCHAR_MAX;

            continue;
          }

          entropyCoder.arithSetCodState (prevCodState[ds]);
          entropyCoder.arithSetCtxState (prevCtxState[ds], tupleOffset);
          tempBitCount = entropyCoder.arithCodeSigMagn (mag, tupleOffset, 2);

          tempBitCount += (entropyCoder.arithGetCtxState () >> 17) & 31;// +new-old m_acBits
          tempBitCount -= __min ((prevCtxState[ds] >> 17) & 31, tempBitCount);

          currRate[ds] = uint8_t (tempBitCount + numQ);
        }
      }
      // statistically best place to save states is after ds == 0
      tempCodState[is] = entropyCoder.arithGetCodState ();
      tempCtxState[is] = entropyCoder.arithGetCtxState ();
    } // for is
#if EC_TRAIN
    refSfbDist += quantDist[tuple][0];
#endif
    memcpy (prevCodState, tempCodState, numStates * sizeof (uint32_t));
    memcpy (prevCtxState, tempCtxState, numStates * sizeof (uint32_t));
  } // for tuple

  entropyCoder.arithSetCodState (codStart); // back to last state
  entropyCoder.arithSetCtxState (ctxStart, coeffOffset);
  // restore third-last tuple value, see insufficiency note above
  if (coeffOffset > 5) entropyCoder.arithGetTuplePtr ()[(coeffOffset >> 1) - 3] = tempQuant[3];

#if EC_TRAIN
  tempBitCount = targetBitCount + 1; // Viterbi search for minimum distortion at target rate
  for (double lambda = 0.015625; (lambda <= 0.375) && (tempBitCount > targetBitCount); lambda += 0.0078125)
#endif
  {
    double* const  prevCost = prevVtrbCost;
#if !EC_TRAIN
    uint8_t* const prevPath = (uint8_t*) quantDist;// backtracker
#endif
    double   costMinIs = (double) UINT_MAX;
    unsigned pathMinIs = 0;
#if EC_TRAIN
    uint8_t prevPath[16*4];
    tempSfbDist = 0.0;
#endif

    for (is = 0; is < numStates; is++) // initialize minimum path
    {
      const uint8_t  currRate = quantRate[is * numStates];

      prevCost[is] = (currRate >= UCHAR_MAX ? (double) UINT_MAX : lambda * currRate + quantDist[0][is]);
      prevPath[is] = 0;
    }

    for (tuple = 1; tuple < numTuples; tuple++) // find min. path
    {
      double* const  currCost = tempVtrbCost;
      uint8_t* const currPath = &prevPath[tuple * numStates];

      for (is = 0; is < numStates; is++)  // tuple's minimum path
      {
        uint8_t* currRate = &quantRate[(is + tuple * numStates) * numStates];
        double  costMinDs = (double) UINT_MAX;
        uint8_t pathMinDs = 0;

        for (ds = numStates - 1; ds >= 0; ds--)    // transitions
        {
          const double costCurr = (currRate[ds] >= UCHAR_MAX ? (double) UINT_MAX : prevCost[ds] + lambda * currRate[ds]);

          if (costMinDs > costCurr)
          {
            costMinDs = costCurr;
            pathMinDs = (uint8_t) ds;
          }
        }
        if (costMinDs < UINT_MAX) costMinDs += quantDist[tuple][is];

        currCost[is] = costMinDs;
        currPath[is] = pathMinDs;
      } // for is

      memcpy (prevCost, currCost, numStates * sizeof (double)); // TODO: avoid memcpy, use pointer swapping instead for speed
    } // for tuple

    for (tempBitCount = is = 0; is < numStates; is++)  // minimum
    {
      if (costMinIs > prevCost[is])
      {
        costMinIs = prevCost[is];
        pathMinIs = is;
      }
    }

    for (tuple--; tuple > 0; tuple--)  // min-cost rate and types
    {
      const uint8_t* currPath = &prevPath[tuple * numStates];
      const uint8_t pathMinDs = currPath[pathMinIs];

      optimalIs[tuple] = (uint8_t) pathMinIs;
      tempBitCount    += quantRate[pathMinDs + (pathMinIs + tuple * numStates) * numStates];
#if EC_TRAIN
      tempSfbDist += quantDist[tuple][pathMinIs];
#endif
      pathMinIs = pathMinDs;
    }
    optimalIs[0]  = (uint8_t) pathMinIs;
    tempBitCount += quantRate[pathMinIs * numStates];
#if EC_TRAIN
    tempSfbDist += quantDist[0][pathMinIs];
#endif
  } // Viterbi search

#if EC_TRAIN
  if ((tempSfbDist <= refSfbDist) || (tempBitCount <= targetBitCount))
#endif
  {
#if !EC_TRAIN
    numQ = 0;
#endif
    for (tuple = 0; tuple < numTuples; tuple++) // re-quantize SFB with R/D optimal rounding
    {
      const uint16_t tupleStart = tuple << 1;
      const uint8_t  tupIs = optimalIs[tuple];
      uint8_t& coeffQuantA = quantCoeffs[tupleStart];
      uint8_t& coeffQuantB = quantCoeffs[tupleStart + 1];

      if (tupIs != 0) // optimal red of quantized MDCT magnitudes
      {
        coeffQuantA -= (tupIs >> 1);
        coeffQuantB -= (tupIs &  1);
      }
#if !EC_TRAIN
      numQ += (coeffQuantA > 0 ? 1 : 0) + (coeffQuantB > 0 ? 1 : 0);
#endif
    } // for tuple

#if EC_TRAIN
    return tempBitCount;
#else
    return (1u << 15) | numQ; // final stats: OK flag | sign bits
#endif
  }

  return targetBitCount;
}
#endif // EC_TRELLIS_OPT_CODING
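
// Schematic, self-contained illustration of the Viterbi search used by quantizeMagnRDOC ()
// above and quantizeSpecRDOC () below: each stage offers a few candidate states, each state
// has a distortion, each transition has a rate, and the path minimizing D + lambda * R is
// found by dynamic programming with backtracking. The state/rate model below is generic and
// NOT exhale's actual one (it reuses the coder state machinery instead); not compiled:
#if 0
#include <vector>

struct TrellisResult { std::vector<int> path; double cost; };

static TrellisResult viterbiMinCost (const std::vector<std::vector<double> >& dist,                 // [stage][state]
                                     const std::vector<std::vector<std::vector<double> > >& rate,   // [stage][state][prevState]
                                     const double lambda)
{
  const size_t numStages = dist.size (), numStates = dist[0].size ();
  std::vector<std::vector<size_t> > from (numStages, std::vector<size_t> (numStates, 0));
  std::vector<double> prevCost (numStates), currCost (numStates);

  for (size_t is = 0; is < numStates; is++) // first stage: rate taken for predecessor 0
  {
    prevCost[is] = dist[0][is] + lambda * rate[0][is][0];
  }

  for (size_t stage = 1; stage < numStages; stage++)
  {
    for (size_t is = 0; is < numStates; is++) // best predecessor for each state
    {
      size_t bestDs = 0;
      double bestCost = prevCost[0] + lambda * rate[stage][is][0];

      for (size_t ds = 1; ds < numStates; ds++)
      {
        const double cost = prevCost[ds] + lambda * rate[stage][is][ds];

        if (cost < bestCost) { bestCost = cost; bestDs = ds; }
      }
      from[stage][is] = bestDs;
      currCost[is] = bestCost + dist[stage][is];
    }
    prevCost = currCost;
  }

  TrellisResult r; // backtrack from the cheapest final state
  size_t is = 0;
  for (size_t s = 1; s < numStates; s++) if (prevCost[s] < prevCost[is]) is = s;
  r.cost = prevCost[is];
  r.path.assign (numStages, 0);
  for (size_t stage = numStages; stage-- > 0; ) { r.path[stage] = (int) is; is = from[stage][is]; }

  return r;
}
#endif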

// constructor
SfbQuantizer::SfbQuantizer ()
{
  // initialize all helper buffers
  m_coeffMagn = nullptr;
#if EC_TRELLIS_OPT_CODING
  m_coeffTemp = nullptr;
#endif
  m_lut2ExpX4 = nullptr;
  m_lutSfNorm = nullptr;
  m_lutXExp43 = nullptr;

  m_maxSfIndex = 0;
#if EC_TRELLIS_OPT_CODING
  m_numCStates = 0;

  for (unsigned b = 0; b < 52; b++)
  {
    m_quantDist[b] = nullptr;
    m_quantInSf[b] = nullptr;
    m_quantRate[b] = nullptr;
  }
#endif
}

// destructor
SfbQuantizer::~SfbQuantizer ()
{
  // free allocated helper buffers
  MFREE (m_coeffMagn);
#if EC_TRELLIS_OPT_CODING
  MFREE (m_coeffTemp);
#endif
  MFREE (m_lut2ExpX4);
  MFREE (m_lutSfNorm);
  MFREE (m_lutXExp43);
#if EC_TRELLIS_OPT_CODING

  for (unsigned b = 0; b < 52; b++)
  {
    MFREE (m_quantDist[b]);
    MFREE (m_quantInSf[b]);
    MFREE (m_quantRate[b]);
  }
#endif
}

// public functions
unsigned SfbQuantizer::initQuantMemory (const unsigned maxTransfLength,
#if EC_TRELLIS_OPT_CODING
                                        const uint8_t numSwb, const uint8_t bitRateMode, const unsigned samplingRate,
#endif
                                        const uint8_t maxScaleFacIndex /*= SCHAR_MAX*/)
{
  const unsigned numScaleFactors = (unsigned) maxScaleFacIndex + 1;
#if EC_TRELLIS_OPT_CODING
  const uint8_t complexityOffset = (samplingRate < 28800 ? 8 - (samplingRate >> 13) : 5) + ((bitRateMode == 0) && (samplingRate >= 8192) ? 1 : 0);
  const uint8_t numTrellisStates = complexityOffset - __min (2, (bitRateMode + 2) >> 2);  // number of states per SFB
  const uint8_t numSquaredStates = numTrellisStates * numTrellisStates;
  const uint16_t quantRateLength = (samplingRate < 28800 || samplingRate >= 57600 ? 512 : 256); // quantizeMagnRDOC()
#endif
  unsigned x;

  if ((maxTransfLength < 128) || (maxTransfLength > 2048) || (maxTransfLength & 7) || (maxScaleFacIndex == 0) || (maxScaleFacIndex > SCHAR_MAX))
  {
    return 1; // invalid arguments error
  }

  m_maxSfIndex = maxScaleFacIndex;

  if ((m_coeffMagn = (unsigned*) malloc (maxTransfLength * sizeof (unsigned))) == nullptr ||
#if EC_TRELLIS_OPT_CODING
      (m_coeffTemp = (uint8_t* ) malloc (maxTransfLength + quantRateLength  )) == nullptr ||
#endif
      (m_lut2ExpX4 = (double*  ) malloc (numScaleFactors * sizeof (double  ))) == nullptr ||
      (m_lutSfNorm = (double*  ) malloc (numScaleFactors * sizeof (double  ))) == nullptr ||
      (m_lutXExp43 = (double*  ) malloc ((SCHAR_MAX + 1) * sizeof (double  ))) == nullptr)
  {
    return 2; // memory allocation error
  }
#if EC_TRELLIS_OPT_CODING
  m_maxSize8M1 = (maxTransfLength >> 3) - 1;
  m_numCStates = numTrellisStates;
  m_rateIndex  = bitRateMode;

  for (x = 0; x < __min (52u, numSwb); x++)
  {
    if ((m_quantDist[x] = (double*  ) malloc (numTrellisStates * sizeof (double  ))) == nullptr ||
        (m_quantInSf[x] = (uint8_t* ) malloc (numTrellisStates * sizeof (uint8_t ))) == nullptr ||
        (m_quantRate[x] = (uint16_t*) malloc (numSquaredStates * sizeof (uint16_t))) == nullptr)
    {
      return 2;
    }
  }
#else
  memset (m_coeffTemp, 0, sizeof (m_coeffTemp));
#endif
  // calculate scale factor gain 2^(x/4)
  for (x = 0; x < numScaleFactors; x++)
  {
    m_lut2ExpX4[x] = pow (2.0, (double) x / 4.0);
    m_lutSfNorm[x] = 1.0 / m_lut2ExpX4[x];
  }
  // calculate dequantized coeff x^(4/3)
  for (x = 0; x < (SCHAR_MAX + 1); x++)
  {
    m_lutXExp43[x] = pow ((double) x, 4.0 / 3.0);
  }

  return 0; // no error
}
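
// The three lookup tables initialized above implement the AAC-style nonuniform quantizer:
// m_lut2ExpX4[sf] = 2^(sf/4) is the quantizer gain (step size), m_lutSfNorm[sf] = 2^(-sf/4)
// its reciprocal used to normalize magnitudes, and m_lutXExp43[q] = q^(4/3) the dequantized
// level. A tiny standalone round trip using pow () instead of the tables; the 0.4054 offset
// is the common AAC rounding constant, not necessarily SFB_QUANT_OFFSET (not compiled):
#if 0
#include <cmath>

static void quantRoundTripDemo ()
{
  const double magnitude = 1000.0; // |MDCT coefficient|
  const int scaleFactor  = 20;

  const double stepSize  = pow (2.0, scaleFactor / 4.0);                      // 2^(sf/4)
  const int    level     = (int) (pow (magnitude / stepSize, 0.75) + 0.4054); // quantized q
  const double dequant   = pow ((double) level, 4.0 / 3.0) * stepSize;        // q^(4/3) * 2^(sf/4)

  (void) dequant; // approximates the original magnitude; error grows with the step size
}
#endif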

uint8_t SfbQuantizer::quantizeSpecSfb (EntropyCoder& entropyCoder, const int32_t* const inputCoeffs, const uint8_t grpLength,
                                       const uint16_t* const grpOffsets, uint32_t* const grpStats,  // quant./coding statistics
                                       const unsigned sfb, const uint8_t sfIndex, const uint8_t sfIndexPred /*= UCHAR_MAX*/,
                                       uint8_t* const quantCoeffs /*= nullptr*/) // returns the RD optimized scale factor index
{
#if EC_TRELLIS_OPT_CODING
  EntropyCoder* const entrCoder = (grpLength == 1 ? &entropyCoder : nullptr);
#endif
  uint8_t sfBest = sfIndex;

  if ((inputCoeffs == nullptr) || (grpOffsets == nullptr) || (sfb >= 52) || (sfIndex > m_maxSfIndex))
  {
    return UCHAR_MAX; // invalid input error
  }

#if EC_TRELLIS_OPT_CODING
  if (grpLength == 1) // references for RDOC
  {
    m_quantDist[sfb][1] = -1.0;
    m_quantInSf[sfb][1] = sfIndex;
    m_quantRate[sfb][1] = 0; // for sgn bits
    m_quantRate[sfb][0] = entropyCoder.arithGetCtxState () & USHRT_MAX; // ref start context
  }
#endif
  if ((sfIndex == 0) || (sfIndexPred <= m_maxSfIndex && sfIndex + INDEX_OFFSET < sfIndexPred))
  {
    const uint16_t   grpStart = grpOffsets[0];
    const uint16_t   sfbStart = grpOffsets[sfb];
    const uint16_t   sfbWidth = grpOffsets[sfb + 1] - sfbStart;
    uint32_t* const coeffMagn = &m_coeffMagn[sfbStart];

    for (int i = sfbWidth - 1; i >= 0; i--) // back up magnitudes
    {
      coeffMagn[i] = abs (inputCoeffs[sfbStart + i]);
    }

    if (quantCoeffs)
    {
      memset (&quantCoeffs[sfbStart], 0, sfbWidth * sizeof (uint8_t)); // SFB output zeroing
      if (grpStats) // approximate bit count
      {
        grpStats[sfb] = getBitCount (entropyCoder, 0, 0, grpLength, &quantCoeffs[grpStart], sfbStart - grpStart, sfbWidth);
      }
    }
    return sfIndexPred - (sfIndex == 0 ? 0 : INDEX_OFFSET); // save delta bits if applicable
  }
  else // nonzero sf, optimized quantization
  {
    const uint16_t   grpStart = grpOffsets[0];
    const uint16_t   sfbStart = grpOffsets[sfb];
    const uint16_t   sfbWidth = grpOffsets[sfb + 1] - sfbStart;
    const uint16_t   cpyWidth = sfbWidth * sizeof (uint8_t);
    uint32_t* const coeffMagn = &m_coeffMagn[sfbStart];
    uint32_t codStart = 0, ctxStart = 0;
    uint32_t codFinal = 0, ctxFinal = 0;
    double   distBest = 0, distCurr = 0;
    short    maxQBest = 0, maxQCurr = 0;
    short    numQBest = 0, numQCurr = 0;
#if EC_TRELLIS_OPT_CODING
    bool rdOptimQuant = (grpLength != 1);
#else
    bool rdOptimQuant = true;
#endif
    uint8_t* ptrBest  = &m_coeffTemp[0];
    uint8_t* ptrCurr  = &m_coeffTemp[100];
    uint8_t  sfCurr   = sfIndex;

    for (int i = sfbWidth - 1; i >= 0; i--) // back up magnitudes
    {
      coeffMagn[i] = abs (inputCoeffs[sfbStart + i]);
    }

// --- determine default quantization result using range limited scale factor as a reference
    sfBest = quantizeMagnSfb (coeffMagn, sfCurr, ptrBest, sfbWidth,
#if EC_TRELLIS_OPT_CODING
                              entrCoder, sfbStart - grpStart,
#endif
                              &maxQBest, &numQBest);

    if (maxQBest > SCHAR_MAX) // limit SNR via scale factor index
    {
      for (uint8_t c = 0; (c < 2) && (maxQBest > SCHAR_MAX); c++)  // very rarely done twice
      {
        sfCurr += getScaleFacOffset (pow ((double) maxQBest, 4.0 / 3.0) * 0.001566492688) + c; // / m_lutXExp43[SCHAR_MAX]
        sfBest = quantizeMagnSfb (coeffMagn, sfCurr, ptrBest, sfbWidth,
#if EC_TRELLIS_OPT_CODING
                                  entrCoder, sfbStart - grpStart,
#endif
                                  &maxQBest, &numQBest);
      }
      rdOptimQuant = false;
    }
    else if ((sfBest < sfCurr) && (sfBest != sfIndexPred)) // re-optimize above quantization
    {
      sfBest = quantizeMagnSfb (coeffMagn, --sfCurr, ptrBest, sfbWidth,
#if EC_TRELLIS_OPT_CODING
                                entrCoder, sfbStart - grpStart,
#endif
                                &maxQBest, &numQBest);

      rdOptimQuant &= (maxQBest <= SCHAR_MAX);
    }

#if EC_TRELLIS_OPT_CODING
    if (grpLength == 1) // ref masking level
    {
      m_quantInSf[sfb][1] = __min (sfCurr, m_maxSfIndex);
    }
#endif
    if (maxQBest == 0) // SFB was quantized to zero - zero output
    {
      if (quantCoeffs)
      {
        memset (&quantCoeffs[sfbStart], 0, cpyWidth);
        if (grpStats) // estimated bit count
        {
          grpStats[sfb] = getBitCount (entropyCoder, 0, 0, grpLength, &quantCoeffs[grpStart], sfbStart - grpStart, sfbWidth);
        }
      }
      return sfIndexPred; // repeat scale factor, save delta bits
    }

// --- check whether optimized quantization and coding results in lower rate-distortion cost
    distBest = getQuantDist (coeffMagn, sfBest, ptrBest, sfbWidth);

#if EC_TRELLIS_OPT_CODING
    if (grpLength == 1) // ref band-wise NMR
    {
      const double refSfbNmrDiv = m_lutSfNorm[m_quantInSf[sfb][1]];

      m_quantDist[sfb][1] = distBest * refSfbNmrDiv * refSfbNmrDiv;
      m_quantRate[sfb][1] = numQBest; // sgn
    }
#endif
    if (quantCoeffs)
    {
      memcpy (&quantCoeffs[sfbStart], ptrBest, cpyWidth);

      codStart = entropyCoder.arithGetCodState (); // start state
      ctxStart = entropyCoder.arithGetCtxState ();
      numQBest += getBitCount (entropyCoder, sfBest, sfIndexPred, grpLength, &quantCoeffs[grpStart], sfbStart - grpStart, sfbWidth);
      codFinal = entropyCoder.arithGetCodState (); // final state
      ctxFinal = entropyCoder.arithGetCtxState ();
    }
    rdOptimQuant &= (distBest > 0.0);

    if ((sfBest < sfCurr) && (sfBest != sfIndexPred) && rdOptimQuant) // R/D re-optimization
    {
#if EC_TRELLIS_OPT_CODING
      const double refSfbNmrDiv = m_lutSfNorm[sfCurr];
      const double lambda       = getLagrangeValue (m_rateIndex);
#endif
      sfCurr = quantizeMagnSfb (coeffMagn, sfCurr - 1, ptrCurr, sfbWidth,
#if EC_TRELLIS_OPT_CODING
                                entrCoder, sfbStart - grpStart,
#endif
                                &maxQCurr, &numQCurr);

      distCurr = getQuantDist (coeffMagn, sfCurr, ptrCurr, sfbWidth);
      if (quantCoeffs)
      {
        memcpy (&quantCoeffs[sfbStart], ptrCurr, cpyWidth);

        entropyCoder.arithSetCodState (codStart);  // reset state
        entropyCoder.arithSetCtxState (ctxStart);
        numQCurr += getBitCount (entropyCoder, sfCurr, sfIndexPred, grpLength, &quantCoeffs[grpStart], sfbStart - grpStart, sfbWidth);
      }

      // rate-distortion decision, using empirical Lagrange value
#if EC_TRELLIS_OPT_CODING
      if (distCurr * refSfbNmrDiv * refSfbNmrDiv + lambda * numQCurr < distBest * refSfbNmrDiv * refSfbNmrDiv + lambda * numQBest)
#else
      if ((maxQCurr <= maxQBest) && (numQCurr <= numQBest + (distCurr >= distBest ? -1 : short (0.5 + distBest / __max (1.0, distCurr)))))
#endif
      {
        maxQBest = maxQCurr;
        numQBest = numQCurr;
        sfBest   = sfCurr;
      }
      else if (quantCoeffs) // discard result, recover best trial
      {
        memcpy (&quantCoeffs[sfbStart], ptrBest, cpyWidth);

        entropyCoder.arithSetCodState (codFinal);  // reset state
        entropyCoder.arithSetCtxState (ctxFinal);
      }
    }

    if (grpStats)
    {
      grpStats[sfb] = ((uint32_t) maxQBest << 16) | numQBest; // max magnitude and bit count
    }
  } // if sfIndex == 0

  return __min (sfBest, m_maxSfIndex);
}
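
// Note on the constant 0.001566492688 used in quantizeSpecSfb () above: it equals
// 1 / 127^(4/3) = 1 / m_lutXExp43[SCHAR_MAX], so the getScaleFacOffset () argument is the
// ratio maxQBest^(4/3) / 127^(4/3), i.e. the gain by which the scale factor must grow so
// that the largest re-quantized magnitude falls back to at most SCHAR_MAX = 127. A quick
// standalone check of the constant (illustration only, not compiled):
#if 0
#include <cassert>
#include <cmath>

static void scaleFacOffsetConstCheck ()
{
  const double invDequant127 = 1.0 / pow (127.0, 4.0 / 3.0); // = 0.00156649...

  assert (fabs (invDequant127 - 0.001566492688) < 1.0e-9);
}
#endif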

#if EC_TRELLIS_OPT_CODING
unsigned SfbQuantizer::quantizeSpecRDOC (EntropyCoder& entropyCoder, uint8_t* const optimalSf, const unsigned targetBitCount,
                                         const uint16_t* const grpOffsets, uint32_t* const grpStats,  // quant./coding statistics
                                         const unsigned numSfb, uint8_t* const quantCoeffs)  // returns RD optimization bit count
{
  // numSfb: number of trellis stages. Based on: A. Aggarwal, S. L. Regunathan, and K. Rose,
  // "Trellis-Based Optimization of MPEG-4 Advanced Audio Coding," see also quantizeMagnRDOC
  const uint32_t codStart = USHRT_MAX << 16;
  const uint32_t ctxStart = m_quantRate[0][0]; // start context before call to quantizeSfb()
  const uint32_t codFinal = entropyCoder.arithGetCodState ();
  const uint32_t ctxFinal = entropyCoder.arithGetCtxState (); // after call to quantizeSfb()
  const uint16_t grpStart = grpOffsets[0];
  uint8_t* const inScaleFac = &m_coeffTemp[((unsigned) m_maxSize8M1 - 6) << 3];
  uint32_t  prevCodState[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  uint32_t  prevCtxState[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t   prevScaleFac[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  double    prevVtrbCost[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  uint32_t  tempCodState[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  uint32_t  tempCtxState[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  uint8_t   tempScaleFac[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  double    tempVtrbCost[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  unsigned  tempBitCount, sfb, is;
  int ds;
#if EC_TRAIN
  double refGrpDist = 0.0, tempGrpDist = 0.0;
#else
  const double lambda = getLagrangeValue (m_rateIndex);
#endif

  if ((optimalSf == nullptr) || (quantCoeffs == nullptr) || (grpOffsets == nullptr) || (numSfb == 0) || (numSfb > 52) ||
      (targetBitCount == 0)  || (targetBitCount > SHRT_MAX))
  {
    return 0; // invalid input error
  }

  for (sfb = 0; sfb < numSfb; sfb++) // SFB-wise scale factor, weighted distortion, and rate
  {
    const uint8_t       refSf = m_quantInSf[sfb][1];
    const uint16_t    refNumQ = m_quantRate[sfb][1];
    const double refQuantDist = m_quantDist[sfb][1];
    const double refQuantNorm = m_lutSfNorm[refSf] * m_lutSfNorm[refSf];
    const uint16_t   sfbStart = grpOffsets[sfb];
    const uint16_t   sfbWidth = grpOffsets[sfb + 1] - sfbStart;
    const uint32_t* coeffMagn = &m_coeffMagn[sfbStart];
    uint8_t* const  tempQuant = &m_coeffTemp[sfbStart - grpStart];
    bool maxSnrReached = false;

    if (refQuantDist < 0.0) memset (tempQuant, 0, sfbWidth * sizeof (uint8_t));
#if EC_TRAIN
    else refGrpDist += refQuantDist;
#endif
    if (grpStats)
    {
      grpStats[sfb] = (grpStats[sfb] & (USHRT_MAX << 16)) | refNumQ; // keep magn, sign bits
    }

    for (is = 0; is < m_numCStates; is++) // populate SFB trellis
    {
      const uint8_t* mag = (is != 1 ? m_coeffTemp /*= tempQuant[grpStart - sfbStart]*/ : &quantCoeffs[grpStart]);
      double&   currDist = m_quantDist[sfb][is];
      uint16_t* currRate = &m_quantRate[sfb][is * m_numCStates];
      uint8_t     sfBest = optimalSf[sfb]; // optimal scalefactor
      short maxQCurr = 0, numQCurr = 0; // for sign bits counting

      if (refQuantDist < 0.0) // -1.0 means SFB is zero-quantized
      {
        currDist = -1.0;
        m_quantInSf[sfb][is] = refSf;
      }
      else if (is != 1) // quantization & distortion not computed
      {
        const uint8_t sfCurr = __max (0, __min (m_maxSfIndex, refSf + 1 - (int) is));

        currDist = -1.0;
        if ((sfCurr == 0) || maxSnrReached)
        {
          maxSnrReached = true;
        }
        else // sfCurr > 0 && sfCurr <= m_maxSfIndex, re-quantize
        {
          sfBest = quantizeMagnSfb (coeffMagn, sfCurr, tempQuant, sfbWidth,
                                    &entropyCoder, sfbStart - grpStart,
                                    &maxQCurr, &numQCurr);

          if (maxQCurr > SCHAR_MAX)
          {
            maxSnrReached = true; numQCurr = 0;
          }
          else
          {
            currDist = getQuantDist (coeffMagn, sfBest, tempQuant, sfbWidth) * refQuantNorm;
          }
        }
        if (currDist < 0.0) memset (tempQuant, 0, sfbWidth * sizeof (uint8_t));
        m_quantInSf[sfb][is] = sfCurr; // store initial scale fac
      }
      else // is == 1, quant. & dist. computed with quantizeSfb()
      {
        numQCurr = refNumQ;
      }

      if (sfb == 0) // first SFB, having sfbStart - grpStart == 0
      {
        entropyCoder.arithSetCodState (codStart);  // group start
        entropyCoder.arithSetCtxState (ctxStart, 0);
        tempBitCount = (maxSnrReached ? USHRT_MAX : numQCurr + getBitCount (entropyCoder, sfBest, UCHAR_MAX, 1, mag, 0, sfbWidth));

        for (ds = m_numCStates - 1; ds >= 0; ds--)
        {
          currRate[ds] = (uint16_t) tempBitCount;
        }
        tempCodState[is] = entropyCoder.arithGetCodState ();
        tempCtxState[is] = entropyCoder.arithGetCtxState ();
      }
      else // sfb > 0, rate depends on decisions in preceding SFB
      {
        for (ds = m_numCStates - 1; ds >= 0; ds--)
        {
          const uint16_t prevRate = m_quantRate[sfb - 1][ds * m_numCStates];

          entropyCoder.arithSetCodState (prevCodState[ds]);
          entropyCoder.arithSetCtxState (prevCtxState[ds], sfbStart - grpStart);
          tempBitCount = (maxSnrReached || (prevRate >= USHRT_MAX) ? USHRT_MAX : numQCurr + getBitCount (entropyCoder,
                           (numQCurr == 0 ? prevScaleFac[ds] : sfBest), prevScaleFac[ds], 1, mag, sfbStart - grpStart, sfbWidth));
          currRate[ds] = (uint16_t) tempBitCount;

          if (ds == 1) // statistically best place to save states
          {
            tempCodState[is] = entropyCoder.arithGetCodState ();
            tempCtxState[is] = entropyCoder.arithGetCtxState ();
          }
        }
      }
      tempScaleFac[is] = sfBest; // optimized factor for next SFB
    } // for is

    memcpy (prevCodState, tempCodState, m_numCStates * sizeof (uint32_t));
    memcpy (prevCtxState, tempCtxState, m_numCStates * sizeof (uint32_t));
    memcpy (prevScaleFac, tempScaleFac, m_numCStates * sizeof (uint8_t ));
  } // for sfb

  entropyCoder.arithSetCodState (codFinal); // back to last state
  entropyCoder.arithSetCtxState (ctxFinal, grpOffsets[numSfb] - grpStart);

#if EC_TRAIN
  tempBitCount = targetBitCount + 1; // Viterbi search for minimum distortion at target rate
  for (double lambda = 0.015625; (lambda <= 0.375) && (tempBitCount > targetBitCount); lambda += 0.0078125)
#endif
  {
    double* const  prevCost = prevVtrbCost;
    uint8_t* const prevPath = m_coeffTemp; // trellis backtracker
    double   costMinIs = (double) UINT_MAX;
    unsigned pathMinIs = 1;
#if EC_TRAIN
    tempGrpDist = 0.0;
#endif

    for (is = 0; is < m_numCStates; is++) // initial minimum path
    {
      const uint16_t currRate = m_quantRate[0][is * m_numCStates];

      prevCost[is] = (currRate >= USHRT_MAX ? (double) UINT_MAX : lambda * currRate + __max (0.0, m_quantDist[0][is]));
      prevPath[is] = 0;
    }

    for (sfb = 1; sfb < numSfb; sfb++) // search for minimum path
    {
      double* const  currCost = tempVtrbCost;
      uint8_t* const currPath = &prevPath[sfb * m_numCStates];

      for (is = 0; is < m_numCStates; is++) // SFB's minimum path
      {
        uint16_t* currRate = &m_quantRate[sfb][is * m_numCStates];
        double   costMinDs = (double) UINT_MAX;
        uint8_t  pathMinDs = 1;

        for (ds = m_numCStates - 1; ds >= 0; ds--) // transitions
        {
          const double costCurr = (currRate[ds] >= USHRT_MAX ? (double) UINT_MAX : prevCost[ds] + lambda * currRate[ds]);

          if (costMinDs > costCurr)
          {
            costMinDs = costCurr;
            pathMinDs = (uint8_t) ds;
          }
        }
        if (costMinDs < UINT_MAX) costMinDs += __max (0.0, m_quantDist[sfb][is]);

        currCost[is] = costMinDs;
        currPath[is] = pathMinDs;
      } // for is

      memcpy (prevCost, currCost, m_numCStates * sizeof (double)); // TODO: avoid memcpy, use pointer swapping instead for speed
    } // for sfb

    for (sfb--, is = 0; is < m_numCStates; is++) // group minimum
    {
      if (costMinIs > prevCost[is])
      {
        costMinIs = prevCost[is];
        pathMinIs = is;
      }
    }

    for (tempBitCount = 0; sfb > 0; sfb--) // min-cost group rate
    {
      const uint8_t* currPath = &prevPath[sfb * m_numCStates];
      const uint8_t pathMinDs = currPath[pathMinIs];

      inScaleFac[sfb] = (m_quantDist[sfb][pathMinIs] < 0.0 ? UCHAR_MAX : m_quantInSf[sfb][pathMinIs]);
      tempBitCount   +=  m_quantRate[sfb][pathMinDs + pathMinIs * m_numCStates];
#if EC_TRAIN
      tempGrpDist += __max (0.0, m_quantDist[sfb][pathMinIs]);
#endif
      pathMinIs = pathMinDs;
    }
    inScaleFac[0] = (m_quantDist[0][pathMinIs] < 0.0 ? UCHAR_MAX : m_quantInSf[0][pathMinIs]);
    tempBitCount +=  m_quantRate[0][pathMinIs * m_numCStates];
#if EC_TRAIN
    tempGrpDist += __max (0.0, m_quantDist[0][pathMinIs]);
#endif
  } // Viterbi search

#if EC_TRAIN
  if ((tempGrpDist <= refGrpDist) || (tempBitCount <= targetBitCount))
#endif
  {
    uint8_t sfIndexPred = UCHAR_MAX;

    if (grpStats)
    {
      entropyCoder.arithSetCodState (codStart);// set group start
      entropyCoder.arithSetCtxState (ctxStart, 0);

      tempBitCount = 0;
    }
    for (sfb = 0; sfb < numSfb; sfb++) // re-quantize spectrum with R/D optimized parameters
    {
      const uint16_t sfbStart = grpOffsets[sfb];
      const uint16_t sfbWidth = grpOffsets[sfb + 1] - sfbStart;

      if ((inScaleFac[sfb] == UCHAR_MAX) || (sfIndexPred <= m_maxSfIndex && inScaleFac[sfb] + INDEX_OFFSET < sfIndexPred))
      {
        memset (&quantCoeffs[sfbStart], 0, sfbWidth * sizeof (uint8_t));  // zero SFB output

        optimalSf[sfb] = sfIndexPred - (inScaleFac[sfb] == UCHAR_MAX ? 0 : INDEX_OFFSET);
      }
      else if (inScaleFac[sfb] != m_quantInSf[sfb][1]) // speedup
      {
        short maxQBest = 0, numQBest = 0;

        optimalSf[sfb] = quantizeMagnSfb (&m_coeffMagn[sfbStart], inScaleFac[sfb], &quantCoeffs[sfbStart], sfbWidth,
                                          &entropyCoder, sfbStart - grpStart,
                                          &maxQBest, &numQBest);

        if (maxQBest == 0) optimalSf[sfb] = sfIndexPred; // empty
        if (grpStats)
        {
          grpStats[sfb] = ((uint32_t) maxQBest << 16) | numQBest; // max magn. and sign bits
        }
      }

      if (grpStats) // complete statistics with per-SFB bit count
      {
        grpStats[sfb] += getBitCount (entropyCoder, optimalSf[sfb], sfIndexPred, 1, &quantCoeffs[grpStart], sfbStart - grpStart, sfbWidth);
        tempBitCount  += grpStats[sfb] & USHRT_MAX;
      }

      if ((sfb > 0) && (optimalSf[sfb] < UCHAR_MAX) && (sfIndexPred == UCHAR_MAX))
      {
        memset (optimalSf, optimalSf[sfb], sfb * sizeof (uint8_t)); // back-propagate factor
      }
      sfIndexPred = optimalSf[sfb];
    } // for sfb

    return tempBitCount + (grpStats ? 2 : 0); // last coding bits
  }

  return targetBitCount;
}
#endif // EC_TRELLIS_OPT_CODING
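
// Hypothetical outline of how the two public entry points above fit together: a first pass
// calls quantizeSpecSfb () once per SFB to obtain reference scale factors and statistics,
// and for long transforms (grpLength == 1) a second pass calls quantizeSpecRDOC () to
// re-optimize the whole group at a target bit budget. The real call sites live elsewhere in
// the encoder; this sketch only mirrors the signatures in this file and is not compiled:
#if 0
static void quantizeGroupSketch (SfbQuantizer& quantizer, EntropyCoder& entropyCoder,
                                 const int32_t* mdctCoeffs, const uint16_t* grpOffsets,
                                 uint32_t* grpStats, uint8_t* scaleFactors, uint8_t* quantCoeffs,
                                 const unsigned numSfb, const unsigned targetBitCount)
{
  uint8_t sfIndexPred = UCHAR_MAX;

  for (unsigned sfb = 0; sfb < numSfb; sfb++) // first pass: per-SFB quantization
  {
    scaleFactors[sfb] = quantizer.quantizeSpecSfb (entropyCoder, mdctCoeffs, /*grpLength*/ 1, grpOffsets,
                                                   grpStats, sfb, scaleFactors[sfb], sfIndexPred, quantCoeffs);
    sfIndexPred = scaleFactors[sfb];
  }
#if EC_TRELLIS_OPT_CODING
  quantizer.quantizeSpecRDOC (entropyCoder, scaleFactors, targetBitCount, // second pass: group-wise RDOC
                              grpOffsets, grpStats, numSfb, quantCoeffs);
#endif
}
#endif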