/*
 * H.265 video codec.
 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
 *
 * This file is part of libde265.
 *
 * libde265 is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * libde265 is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "transform.h"
#include "util.h"

#include <assert.h>

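// chroma QP mapping for 4:2:0 (the chroma QP table of the HEVC spec): QpC values for qPi = 30..42;
// outside this range, QpC presumably equals qPi (below 30) or qPi-6 (above 42), handled in table8_22()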
const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ };


// (8.6.1)
void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
                                    int xCUBase, int yCUBase)
{
  logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC);

  const pic_parameter_set& pps = tctx->img->get_pps();
  const seq_parameter_set& sps = tctx->img->get_sps();
  slice_segment_header* shdr = tctx->shdr;

  // top left pixel position of current quantization group
  int xQG = xCUBase - (xCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));
  int yQG = yCUBase - (yCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));

  logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG);


  // the QP only has to be set on the first call within a quantization group

  /* TODO: check why this does not work with the HoneyBee stream

  if (xQG == tctx->currentQG_x &&
      yQG == tctx->currentQG_y)
    {
      return;
    }
  */

  // when entering a new quantization group, remember the QPY of the previous QG

  if (xQG != tctx->currentQG_x ||
      yQG != tctx->currentQG_y)
    {
      tctx->lastQPYinPreviousQG = tctx->currentQPY;
      tctx->currentQG_x = xQG;
      tctx->currentQG_y = yQG;
    }

  int qPY_PRED;

  // first QG in CTB row ?

  int ctbLSBMask = ((1<<sps.Log2CtbSizeY)-1);
  bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0));

  // first QG in slice ?    TODO: a "firstQG" flag in the thread context would be faster

  int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS;

  int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY;
  int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY;

  bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG);

  // first QG in tile ?

  bool firstQGInTile = false;
  if (pps.tiles_enabled_flag) {
    if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 &&
        (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0)
      {
        int ctbX = xQG >> sps.Log2CtbSizeY;
        int ctbY = yQG >> sps.Log2CtbSizeY;

        firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow
      }
  }


  if (firstQGInSlice || firstQGInTile ||
      (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) {
    qPY_PRED = tctx->shdr->SliceQPY;
  }
  else {
    qPY_PRED = tctx->lastQPYinPreviousQG;
  }


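  // derive the QPY of the left (A) and above (B) neighboring quantization groups;
  // a neighbor that is unavailable or lies in a different CTB falls back to qPY_PRED (8.6.1)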
  int qPYA,qPYB;

  if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) {
    int xTmp = (xQG-1) >> sps.Log2MinTrafoSize;
    int yTmp = (yQG  ) >> sps.Log2MinTrafoSize;
    int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
    int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
    if (ctbAddrA == tctx->CtbAddrInTS) {
      qPYA = tctx->img->get_QPY(xQG-1,yQG);
    }
    else {
      qPYA = qPY_PRED;
    }
  }
  else {
    qPYA = qPY_PRED;
  }

  if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) {
    int xTmp = (xQG  ) >> sps.Log2MinTrafoSize;
    int yTmp = (yQG-1) >> sps.Log2MinTrafoSize;
    int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
    int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
    if (ctbAddrB == tctx->CtbAddrInTS) {
      qPYB = tctx->img->get_QPY(xQG,yQG-1);
    }
    else {
      qPYB = qPY_PRED;
    }
  }
  else {
    qPYB = qPY_PRED;
  }

  qPY_PRED = (qPYA + qPYB + 1)>>1;

  logtrace(LogTransform,"qPY_PRED = %d  (%d, %d)\n",qPY_PRED, qPYA, qPYB);

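  // wrap the predicted QP plus the coded delta into the valid range [-QpBdOffset_Y, 51]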
  int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) %
             (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y;

  tctx->qPYPrime = QPY + sps.QpBdOffset_Y;
  if (tctx->qPYPrime<0) {
    tctx->qPYPrime=0;
  }

  int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb);
  int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr);

  logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
           qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset,
           qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset);

  int qPCb,qPCr;

  if (sps.ChromaArrayType == CHROMA_420) {
    qPCb = table8_22(qPiCb);
    qPCr = table8_22(qPiCr);
  }
  else {
    qPCb = qPiCb;
    qPCr = qPiCr;
  }

  //printf("q: %d %d\n",qPiCb, qPCb);

  tctx->qPCbPrime = qPCb + sps.QpBdOffset_C;
  if (tctx->qPCbPrime<0) {
    tctx->qPCbPrime = 0;
  }

  tctx->qPCrPrime = qPCr + sps.QpBdOffset_C;
  if (tctx->qPCrPrime<0) {
    tctx->qPCrPrime = 0;
  }

  /*
  printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY,
         sps->QpBdOffset_Y,
         pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset,
         pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset,
         sps->QpBdOffset_C, sps->QpBdOffset_C,
         tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime);
  */

  int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase);

  // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why.
  // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit.
  // id:000163,sig:06,src:002041,op:havoc,rep:16.bin
  if (log2CbSize<3) { log2CbSize=3; }

  tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY);
  tctx->currentQPY = QPY;

  /*
  printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase,
         xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY);
  */

  logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n",
           xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime);
}


template <class pixel_t>
void transform_coefficients(acceleration_functions* acceleration,
                            int16_t* coeff, int coeffStride, int nT, int trType,
                            pixel_t* dst, int dstStride, int bit_depth)
{
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);


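  // trType==1 selects the inverse 4x4 DST (used for 4x4 intra luma blocks); otherwise the inverse
  // DCT of size nT is used; both variants add the reconstructed residual directly onto dst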
  if (trType==1) {

    acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);

  } else {

    /**/ if (nT==4)  { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
    else if (nT==8)  { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
    else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
    else             { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
  }

#if 0
  printf("decoded pixels:\n");
  for (int y=0;y<nT;y++,printf("\n"))
    for (int x=0;x<nT;x++) {
      printf("%02x ",dst[y*dstStride+x]);
    }
#endif
}


// TODO: make this an accelerated function
void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT)
{
  const int BitDepthC = tctx->img->get_sps().BitDepth_C;
  const int BitDepthY = tctx->img->get_sps().BitDepth_Y;

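  // cross-component prediction (range extension): add the reconstructed luma residual,
  // rescaled to the chroma bit depth and weighted by ResScaleVal (in units of 1/8), to the chroma residual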
  for (int y=0;y<nT;y++)
    for (int x=0;x<nT;x++) {
      /* TODO: the most common case is BitDepthY == BitDepthC, in which case
         we could just omit both shifts. The second most common case is probably
         BitDepthY > BitDepthC, for which we could eliminate one shift. The remaining
         case also needs only a single shift.
      */

      residual[y*nT+x] += (tctx->ResScaleVal *
                           ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3;
    }
}


template <class pixel_t>
void transform_coefficients_explicit(thread_context* tctx,
                                     int16_t* coeff, int coeffStride, int nT, int trType,
                                     pixel_t* dst, int dstStride, int bit_depth, int cIdx)
{
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);

  const acceleration_functions* acceleration = &tctx->decctx->acceleration;

  int32_t residual_buffer[32*32];
  int32_t* residual;
  if (cIdx==0) {
    residual = tctx->residual_luma;
  }
  else {
    residual = residual_buffer;
  }


  // TODO
  int bdShift = 20 - bit_depth;
  int max_coeff_bits = 15;

  if (trType==1) {

    acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);

  } else {

    /**/ if (nT==4)  { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
    else if (nT==8)  { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
    else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
    else             { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
  }


  //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
  //printBlk("residual",residual,nT,nT);

  if (cIdx != 0) {
    if (tctx->ResScaleVal != 0) {
      cross_comp_pred(tctx, residual, nT);
    }

    //printBlk("cross-comp-pred modified residual",residual,nT,nT);
  }

  acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
}


void inv_transform(acceleration_functions* acceleration,
                   uint8_t* dst, int dstStride, int16_t* coeff,
                   int log2TbSize, int trType)
{
  if (trType==1) {
    assert(log2TbSize==2);

    acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride);

  } else {
    acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride);
  }


#if 0
  int nT = 1<<log2TbSize;
  printf("decoded pixels:\n");
  for (int y=0;y<nT;y++,printf("\n"))
    for (int x=0;x<nT;x++) {
      printf("%02x ",dst[y*dstStride+x]);
    }
#endif
}


void fwd_transform(acceleration_functions* acceleration,
                   int16_t* coeff, int coeffStride, int log2TbSize, int trType,
                   const int16_t* src, int srcStride)
{
  logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize);

  if (trType==1) {
    // DST 4x4

    acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride);
  } else {
    // DCT 4x4, 8x8, 16x16, 32x32

    acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride);
  }
}


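// dequantization scaling factors indexed by qP%6 (8.6.3); the full scale factor is levelScale[qP%6] << (qP/6)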
static const int levelScale[] = { 40,45,51,57,64,72 };

// (8.6.2) and (8.6.3)
template <class pixel_t>
void scale_coefficients_internal(thread_context* tctx,
                                 int xT,int yT, // position of TU in frame (chroma adapted)
                                 int x0,int y0, // position of CU in frame (chroma adapted)
                                 int nT, int cIdx,
                                 bool transform_skip_flag, bool intra, int rdpcmMode)
{
  const seq_parameter_set& sps = tctx->img->get_sps();
  const pic_parameter_set& pps = tctx->img->get_pps();

  int qP;
  switch (cIdx) {
  case 0: qP = tctx->qPYPrime;  break;
  case 1: qP = tctx->qPCbPrime; break;
  case 2: qP = tctx->qPCrPrime; break;
  default: qP = 0; assert(0); break; // should never happen
  }

  logtrace(LogTransform,"qP: %d\n",qP);


  int16_t* coeff;
  int      coeffStride;

  coeff = tctx->coeffBuf;
  coeffStride = nT;


  pixel_t* pred;
  int      stride;
  pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
  stride = tctx->img->get_image_stride(cIdx);

  // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
  // can optimize away a lot of code for 8-bit pixels.
  const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx));

  //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
  int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);

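  // range extension: transform_skip_rotation_enabled_flag requires the coefficients of
  // 4x4 intra blocks to be rotated by 180 degrees before reconstruction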
  bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag &&
                       nT == 4 &&
                       cuPredModeIntra);

  if (tctx->cu_transquant_bypass_flag) {

    int32_t residual_buffer[32*32];

    int32_t* residual;
    if (cIdx==0) residual = tctx->residual_luma;
    else         residual = residual_buffer;


    // TODO: we could fold the coefficient rotation into the coefficient expansion here:
    for (int i=0;i<tctx->nCoeff[cIdx];i++) {
      int32_t currCoeff = tctx->coeffList[cIdx][i];
      tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
    }

    if (rotateCoeffs) {
      tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
    }

    if (rdpcmMode) {
      if (rdpcmMode==2)
        tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
      else
        tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
    }
    else {
      tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
    }

    if (cIdx != 0) {
      if (tctx->ResScaleVal != 0) {
        cross_comp_pred(tctx, residual, nT);
      }
    }

    tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);

    if (rotateCoeffs) {
      memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
    }
  }
  else {
    // (8.6.3)

    int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5;

    logtrace(LogTransform,"bdShift=%d\n",bdShift);

    logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);


    // --- inverse quantization ---

    if (sps.scaling_list_enable_flag==0) {

      //const int m_x_y = 16;
      const int m_x_y = 1;
      bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers

      const int offset = (1<<(bdShift-1));
      const int fact = m_x_y * levelScale[qP%6] << (qP/6);

      for (int i=0;i<tctx->nCoeff[cIdx];i++) {

        // usually this would need 64 bit, but because we reduced the shift above, 32-bit arithmetic is sufficient
        int32_t currCoeff  = tctx->coeffList[cIdx][i];

        //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
        //tctx->coeffList[cIdx][i]);

        currCoeff = Clip3(-32768,32767,
                          ( (currCoeff * fact + offset ) >> bdShift));

        //logtrace(LogTransform," -> %d\n",currCoeff);

        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
      }
    }
    else {
      const int offset = (1<<(bdShift-1));

      const uint8_t* sclist;
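      // select the scaling matrix: the intra lists (Y,Cb,Cr) come before the inter lists;
      // for 32x32 only a luma intra and a luma inter list exist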
      int matrixID = cIdx;
      if (!intra) {
        if (nT<32) { matrixID += 3; }
        else { matrixID++; }
      }

      switch (nT) {
      case  4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
      case  8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
      case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
      case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
      default: assert(0);
      }

      for (int i=0;i<tctx->nCoeff[cIdx];i++) {
        int pos = tctx->coeffPos[cIdx][i];
        int x = pos%nT;
        int y = pos/nT;

        const int m_x_y = sclist[x+y*nT];
        const int fact = m_x_y * levelScale[qP%6] << (qP/6);

        int64_t currCoeff  = tctx->coeffList[cIdx][i];

        currCoeff = Clip3(-32768,32767,
                          ( (currCoeff * fact + offset ) >> bdShift));

        tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
      }
    }


    // --- do transform or skip ---

    logtrace(LogTransform,"coefficients OUT:\n");
    for (int y=0;y<nT;y++) {
      logtrace(LogTransform,"  ");
      for (int x=0;x<nT;x++) {
        logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
      }
      logtrace(LogTransform,"*\n");
    }

    int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C;

    logtrace(LogTransform,"bdShift2=%d\n",bdShift2);

    logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
             transform_skip_flag);

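    // transform skip: no inverse transform; the scaled coefficients are used directly as the
    // residual (shifted by tsShift/bdShift), optionally with RDPCM reconstruction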
    if (transform_skip_flag) {

      int extended_precision_processing_flag = 0;
      int Log2nTbS = Log2(nT);
      int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
      int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
        + Log2nTbS;

      if (rotateCoeffs) {
        tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
      }

      int32_t residual_buffer[32*32];

      int32_t* residual;
      if (cIdx==0) residual = tctx->residual_luma;
      else         residual = residual_buffer;

      if (rdpcmMode) {
        /*
        if (rdpcmMode==2)
          tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
        else
          tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
        */

        if (rdpcmMode==2)
          tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
        else
          tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
      }
      else {
        //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);

        tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
      }

      if (cIdx != 0) {
        if (tctx->ResScaleVal != 0) {
          cross_comp_pred(tctx, residual, nT);
        }
      }

      tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);

      if (rotateCoeffs) {
        memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
      }
    }
    else {
      int trType;

      //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
      if (nT==4 && cIdx==0 && cuPredModeIntra) {
        trType=1;
      }
      else {
        trType=0;
      }

      assert(rdpcmMode==0);


      if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) {
        // cross-component-prediction: transform to residual buffer and add in a separate step

        transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
                                        pred, stride, bit_depth, cIdx);
      }
      else {
        transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
                               pred, stride, bit_depth);
      }
    }
  }


  logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);

  for (int y=0;y<nT;y++) {
    logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);

    for (int x=0;x<nT;x++) {
      logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
    }

    logtrace(LogTransform,"*\n");
  }

  // zero out scrap coefficient buffer again

  for (int i=0;i<tctx->nCoeff[cIdx];i++) {
    tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
  }
}


void scale_coefficients(thread_context* tctx,
                        int xT,int yT, // position of TU in frame (chroma adapted)
                        int x0,int y0, // position of CU in frame (chroma adapted)
                        int nT, int cIdx,
                        bool transform_skip_flag, bool intra,
                        int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical
                        )
{
  if (tctx->img->high_bit_depth(cIdx)) {
    scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
                                          rdpcmMode);
  } else {
    scale_coefficients_internal<uint8_t> (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
                                          rdpcmMode);
  }
}


//#define QUANT_IQUANT_SHIFT    20 // Q(QP%6) * IQ(QP%6) = 2^20
#define QUANT_SHIFT           14 // Q(4) = 2^14
//#define SCALE_BITS            15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ
#define MAX_TR_DYNAMIC_RANGE  15 // Maximum transform dynamic range (excluding sign bit)


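// forward quantization scales Q(qp%6), chosen such that g_quantScales[i] * levelScale[i] is roughly 2^20
// (cf. QUANT_IQUANT_SHIFT above)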
const static uint16_t g_quantScales[6] = {
  26214,23302,20560,18396,16384,14564
};

void quant_coefficients(//encoder_context* ectx,
                        int16_t* out_coeff,
                        const int16_t* in_coeff,
                        int log2TrSize, int qp,
                        bool intra)
{
  const int qpDiv6 = qp / 6;
  const int qpMod6 = qp % 6;

  //int uiLog2TrSize = xLog2( iWidth - 1);

  int uiQ = g_quantScales[qpMod6];
  int bitDepth = 8;
  int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize;  // Represents scaling through forward transform
  int qBits = QUANT_SHIFT + qpDiv6 + transformShift;

  /* TODO: originally, this was checking for intra slices, why not for intra mode?
   */
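  // rounding offset: approximately 1/3 of a quantization step for intra, 1/6 for inter
  // (presumably inherited from the HM reference encoder)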
  int rnd = (intra ? 171 : 85) << (qBits-9);

  int x, y;
  int uiAcSum = 0;

  int nStride = (1<<log2TrSize);

  for (y=0; y < (1<<log2TrSize) ; y++) {
    for (x=0; x < (1<<log2TrSize) ; x++) {
      int level;
      int sign;
      int blockPos = y * nStride + x;
      level  = in_coeff[blockPos];
      //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level);
      sign   = (level < 0 ? -1: 1);

      level = (abs_value(level) * uiQ + rnd ) >> qBits;
      uiAcSum += level;
      level *= sign;
      out_coeff[blockPos] = Clip3(-32768, 32767, level);
      //logtrace(LogTransform,"%d\n", out_coeff[blockPos]);
    }
  }
}


void dequant_coefficients(int16_t* out_coeff,
                          const int16_t* in_coeff,
                          int log2TrSize, int qP)
{
  const int m_x_y = 1;
  int bitDepth = 8;
  int bdShift = bitDepth + log2TrSize - 5;
  bdShift -= 4;  // this is equivalent to having a m_x_y of 16 and we can use 32bit integers

  const int offset = (1<<(bdShift-1));
  const int fact = m_x_y * levelScale[qP%6] << (qP/6);

  int blkSize = (1<<log2TrSize);
  int nCoeff  = (1<<(log2TrSize<<1));

  for (int i=0;i<nCoeff;i++) {

    // usually this would need 64 bit, but because we reduced the shift above, 32-bit arithmetic is sufficient
    int32_t currCoeff  = in_coeff[i];

    //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff);

    currCoeff = Clip3(-32768,32767,
                      ( (currCoeff * fact + offset ) >> bdShift));

    //logtrace(LogTransform," -> %d\n",currCoeff);

    out_coeff[i] = currCoeff;
  }
}