1 /*
2 * H.265 video codec.
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "transform.h"
22 #include "util.h"
23
24 #include <assert.h>
25
26
// Lookup table with 13 entries (values 29..37); a 14th entry (37) is commented out
// in the initializer. The symbol is non-static, so it is visible to other translation units.
// NOTE(review): presumably the data behind table8_22(), which this file calls to map the
// chroma QP index when ChromaArrayType == CHROMA_420 -- confirm against the header.
27 const int tab8_22[] = { 29,30,31,32,33,33,34,34,35,35,36,36,37 /*,37*/ };
28
29
30 // (8.6.1)
decode_quantization_parameters(thread_context * tctx,int xC,int yC,int xCUBase,int yCUBase)31 void decode_quantization_parameters(thread_context* tctx, int xC,int yC,
32 int xCUBase, int yCUBase)
33 {
34 logtrace(LogTransform,">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> decode_quantization_parameters(int xC,int yC)=(%d,%d)\n", xC,yC);
35
36 const pic_parameter_set& pps = tctx->img->get_pps();
37 const seq_parameter_set& sps = tctx->img->get_sps();
38 slice_segment_header* shdr = tctx->shdr;
39
40 // top left pixel position of current quantization group
41 int xQG = xCUBase - (xCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));
42 int yQG = yCUBase - (yCUBase & ((1<<pps.Log2MinCuQpDeltaSize)-1));
43
44 logtrace(LogTransform,"QG: %d,%d\n",xQG,yQG);
45
46
47 // we only have to set QP in the first call in a quantization-group
48
49 /* TODO: check why this does not work with HoneyBee stream
50
51 if (xQG == tctx->currentQG_x &&
52 yQG == tctx->currentQG_y)
53 {
54 return;
55 }
56 */
57
58 // if first QG in CU, remember last QPY of last CU previous QG
59
60 if (xQG != tctx->currentQG_x ||
61 yQG != tctx->currentQG_y)
62 {
63 tctx->lastQPYinPreviousQG = tctx->currentQPY;
64 tctx->currentQG_x = xQG;
65 tctx->currentQG_y = yQG;
66 }
67
68 int qPY_PRED;
69
70 // first QG in CTB row ?
71
72 int ctbLSBMask = ((1<<sps.Log2CtbSizeY)-1);
73 bool firstInCTBRow = (xQG == 0 && ((yQG & ctbLSBMask)==0));
74
75 // first QG in slice ? TODO: a "firstQG" flag in the thread context would be faster
76
77 int first_ctb_in_slice_RS = tctx->shdr->SliceAddrRS;
78
79 int SliceStartX = (first_ctb_in_slice_RS % sps.PicWidthInCtbsY) * sps.CtbSizeY;
80 int SliceStartY = (first_ctb_in_slice_RS / sps.PicWidthInCtbsY) * sps.CtbSizeY;
81
82 bool firstQGInSlice = (SliceStartX == xQG && SliceStartY == yQG);
83
84 // first QG in tile ?
85
86 bool firstQGInTile = false;
87 if (pps.tiles_enabled_flag) {
88 if ((xQG & ((1 << sps.Log2CtbSizeY)-1)) == 0 &&
89 (yQG & ((1 << sps.Log2CtbSizeY)-1)) == 0)
90 {
91 int ctbX = xQG >> sps.Log2CtbSizeY;
92 int ctbY = yQG >> sps.Log2CtbSizeY;
93
94 firstQGInTile = pps.is_tile_start_CTB(ctbX,ctbY); // TODO: this is slow
95 }
96 }
97
98
99 if (firstQGInSlice || firstQGInTile ||
100 (firstInCTBRow && pps.entropy_coding_sync_enabled_flag)) {
101 qPY_PRED = tctx->shdr->SliceQPY;
102 }
103 else {
104 qPY_PRED = tctx->lastQPYinPreviousQG;
105 }
106
107
108 int qPYA,qPYB;
109
110 if (tctx->img->available_zscan(xQG,yQG, xQG-1,yQG)) {
111 int xTmp = (xQG-1) >> sps.Log2MinTrafoSize;
112 int yTmp = (yQG ) >> sps.Log2MinTrafoSize;
113 int minTbAddrA = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
114 int ctbAddrA = minTbAddrA >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
115 if (ctbAddrA == tctx->CtbAddrInTS) {
116 qPYA = tctx->img->get_QPY(xQG-1,yQG);
117 }
118 else {
119 qPYA = qPY_PRED;
120 }
121 }
122 else {
123 qPYA = qPY_PRED;
124 }
125
126 if (tctx->img->available_zscan(xQG,yQG, xQG,yQG-1)) {
127 int xTmp = (xQG ) >> sps.Log2MinTrafoSize;
128 int yTmp = (yQG-1) >> sps.Log2MinTrafoSize;
129 int minTbAddrB = pps.MinTbAddrZS[xTmp + yTmp*sps.PicWidthInTbsY];
130 int ctbAddrB = minTbAddrB >> (2 * (sps.Log2CtbSizeY-sps.Log2MinTrafoSize));
131 if (ctbAddrB == tctx->CtbAddrInTS) {
132 qPYB = tctx->img->get_QPY(xQG,yQG-1);
133 }
134 else {
135 qPYB = qPY_PRED;
136 }
137 }
138 else {
139 qPYB = qPY_PRED;
140 }
141
142 qPY_PRED = (qPYA + qPYB + 1)>>1;
143
144 logtrace(LogTransform,"qPY_PRED = %d (%d, %d)\n",qPY_PRED, qPYA, qPYB);
145
146 int QPY = ((qPY_PRED + tctx->CuQpDelta + 52+2*sps.QpBdOffset_Y) %
147 (52 + sps.QpBdOffset_Y)) - sps.QpBdOffset_Y;
148
149 tctx->qPYPrime = QPY + sps.QpBdOffset_Y;
150 if (tctx->qPYPrime<0) {
151 tctx->qPYPrime=0;
152 }
153
154 int qPiCb = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cb_qp_offset + shdr->slice_cb_qp_offset + tctx->CuQpOffsetCb);
155 int qPiCr = Clip3(-sps.QpBdOffset_C,57, QPY+pps.pic_cr_qp_offset + shdr->slice_cr_qp_offset + tctx->CuQpOffsetCr);
156
157 logtrace(LogTransform,"qPiCb:%d (%d %d), qPiCr:%d (%d %d)\n",
158 qPiCb, pps.pic_cb_qp_offset, shdr->slice_cb_qp_offset,
159 qPiCr, pps.pic_cr_qp_offset, shdr->slice_cr_qp_offset);
160
161 int qPCb,qPCr;
162
163 if (sps.ChromaArrayType == CHROMA_420) {
164 qPCb = table8_22(qPiCb);
165 qPCr = table8_22(qPiCr);
166 }
167 else {
168 qPCb = qPiCb;
169 qPCr = qPiCr;
170 }
171
172 //printf("q: %d %d\n",qPiCb, qPCb);
173
174 tctx->qPCbPrime = qPCb + sps.QpBdOffset_C;
175 if (tctx->qPCbPrime<0) {
176 tctx->qPCbPrime = 0;
177 }
178
179 tctx->qPCrPrime = qPCr + sps.QpBdOffset_C;
180 if (tctx->qPCrPrime<0) {
181 tctx->qPCrPrime = 0;
182 }
183
184 /*
185 printf("Q: %d (%d %d %d / %d %d) %d %d %d\n",QPY,
186 sps->QpBdOffset_Y,
187 pps->pic_cb_qp_offset + shdr->slice_cb_qp_offset,
188 pps->pic_cr_qp_offset + shdr->slice_cr_qp_offset,
189 sps->QpBdOffset_C, sps->QpBdOffset_C,
190 tctx->qPYPrime, tctx->qPCbPrime, tctx->qPCrPrime);
191 */
192
193 int log2CbSize = tctx->img->get_log2CbSize(xCUBase, yCUBase);
194
195 // TODO: On broken input, log2CbSize may be zero (multithreaded only). Not sure yet why.
196 // Maybe another decoding thread is overwriting the value set in slice.cc:read_coding_unit.
197 // id:000163,sig:06,src:002041,op:havoc,rep:16.bin
198 if (log2CbSize<3) { log2CbSize=3; }
199
200 tctx->img->set_QPY(xCUBase, yCUBase, log2CbSize, QPY);
201 tctx->currentQPY = QPY;
202
203 /*
204 printf("SET QPY POC=%d %d;%d-%d;%d = %d\n",ctx->img->PicOrderCntVal,xCUBase,yCUBase,
205 xCUBase+(1<<log2CbSize),yCUBase+(1<<log2CbSize), QPY);
206 */
207
208 logtrace(LogTransform,"qPY(%d,%d,%d)= %d, qPYPrime=%d\n",
209 xCUBase,yCUBase,1<<log2CbSize,QPY,tctx->qPYPrime);
210 }
211
212
213
214 template <class pixel_t>
transform_coefficients(acceleration_functions * acceleration,int16_t * coeff,int coeffStride,int nT,int trType,pixel_t * dst,int dstStride,int bit_depth)215 void transform_coefficients(acceleration_functions* acceleration,
216 int16_t* coeff, int coeffStride, int nT, int trType,
217 pixel_t* dst, int dstStride, int bit_depth)
218 {
219 logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
220
221
222 if (trType==1) {
223
224 acceleration->transform_4x4_dst_add<pixel_t>(dst, coeff, dstStride, bit_depth);
225
226 } else {
227
228 /**/ if (nT==4) { acceleration->transform_add<pixel_t>(0,dst,coeff,dstStride, bit_depth); }
229 else if (nT==8) { acceleration->transform_add<pixel_t>(1,dst,coeff,dstStride, bit_depth); }
230 else if (nT==16) { acceleration->transform_add<pixel_t>(2,dst,coeff,dstStride, bit_depth); }
231 else { acceleration->transform_add<pixel_t>(3,dst,coeff,dstStride, bit_depth); }
232 }
233
234 #if 0
235 printf("decoded pixels:\n");
236 for (int y=0;y<nT;y++,printf("\n"))
237 for (int x=0;x<nT;x++) {
238 printf("%02x ",dst[y*dstStride+x]);
239 }
240 #endif
241 }
242
243
244 // TODO: make this an accelerated function
cross_comp_pred(const thread_context * tctx,int32_t * residual,int nT)245 void cross_comp_pred(const thread_context* tctx, int32_t* residual, int nT)
246 {
247 const int BitDepthC = tctx->img->get_sps().BitDepth_C;
248 const int BitDepthY = tctx->img->get_sps().BitDepth_Y;
249
250 for (int y=0;y<nT;y++)
251 for (int x=0;x<nT;x++) {
252 /* TODO: the most usual case is definitely BitDepthY == BitDepthC, in which case
253 we could just omit two shifts. The second most common case is probably
254 BitDepthY>BitDepthC, for which we could also eliminate one shift. The remaining
255 case is also one shift only.
256 */
257
258 residual[y*nT+x] += (tctx->ResScaleVal *
259 ((tctx->residual_luma[y*nT+x] << BitDepthC ) >> BitDepthY ) ) >> 3;
260 }
261 }
262
263
264 template <class pixel_t>
transform_coefficients_explicit(thread_context * tctx,int16_t * coeff,int coeffStride,int nT,int trType,pixel_t * dst,int dstStride,int bit_depth,int cIdx)265 void transform_coefficients_explicit(thread_context* tctx,
266 int16_t* coeff, int coeffStride, int nT, int trType,
267 pixel_t* dst, int dstStride, int bit_depth, int cIdx)
268 {
269 logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,nT);
270
271 const acceleration_functions* acceleration = &tctx->decctx->acceleration;
272
273 int32_t residual_buffer[32*32];
274 int32_t* residual;
275 if (cIdx==0) {
276 residual = tctx->residual_luma;
277 }
278 else {
279 residual = residual_buffer;
280 }
281
282
283 // TODO
284 int bdShift = 20 - bit_depth;
285 int max_coeff_bits = 15;
286
287 if (trType==1) {
288
289 acceleration->transform_idst_4x4(residual, coeff, bdShift, max_coeff_bits);
290
291 } else {
292
293 /**/ if (nT==4) { acceleration->transform_idct_4x4(residual,coeff,bdShift,max_coeff_bits); }
294 else if (nT==8) { acceleration->transform_idct_8x8(residual,coeff,bdShift,max_coeff_bits); }
295 else if (nT==16) { acceleration->transform_idct_16x16(residual,coeff,bdShift,max_coeff_bits); }
296 else { acceleration->transform_idct_32x32(residual,coeff,bdShift,max_coeff_bits); }
297 }
298
299
300 //printBlk("prediction",(uint8_t*)dst,nT,dstStride);
301 //printBlk("residual",residual,nT,nT);
302
303 if (cIdx != 0) {
304 if (tctx->ResScaleVal != 0) {
305 cross_comp_pred(tctx, residual, nT);
306 }
307
308 //printBlk("cross-comp-pred modified residual",residual,nT,nT);
309 }
310
311 acceleration->add_residual(dst,dstStride, residual,nT, bit_depth);
312 }
313
314
inv_transform(acceleration_functions * acceleration,uint8_t * dst,int dstStride,int16_t * coeff,int log2TbSize,int trType)315 void inv_transform(acceleration_functions* acceleration,
316 uint8_t* dst, int dstStride, int16_t* coeff,
317 int log2TbSize, int trType)
318 {
319 if (trType==1) {
320 assert(log2TbSize==2);
321
322 acceleration->transform_4x4_dst_add_8(dst, coeff, dstStride);
323
324 } else {
325 acceleration->transform_add_8[log2TbSize-2](dst,coeff,dstStride);
326 }
327
328
329 #if 0
330 int nT = 1<<log2TbSize;
331 printf("decoded pixels:\n");
332 for (int y=0;y<nT;y++,printf("\n"))
333 for (int x=0;x<nT;x++) {
334 printf("%02x ",dst[y*dstStride+x]);
335 }
336 #endif
337 }
338
339
fwd_transform(acceleration_functions * acceleration,int16_t * coeff,int coeffStride,int log2TbSize,int trType,const int16_t * src,int srcStride)340 void fwd_transform(acceleration_functions* acceleration,
341 int16_t* coeff, int coeffStride, int log2TbSize, int trType,
342 const int16_t* src, int srcStride)
343 {
344 logtrace(LogTransform,"transform --- trType: %d nT: %d\n",trType,1<<log2TbSize);
345
346 if (trType==1) {
347 // DST 4x4
348
349 acceleration->fwd_transform_4x4_dst_8(coeff, src, srcStride);
350 } else {
351 // DCT 4x4, 8x8, 16x16, 32x32
352
353 acceleration->fwd_transform_8[log2TbSize-2](coeff,src,srcStride);
354 }
355 }
356
357
358
// Dequantization scale factors. The dequantization code in this file indexes the table
// with (qP % 6) and left-shifts the selected entry by (qP / 6).
359 static const int levelScale[] = { 40,45,51,57,64,72 };
360
361 // (8.6.2) and (8.6.3)
//
// Reconstructs one nT x nT transform block of colour component cIdx:
//   1. expands the sparse coefficients (tctx->coeffList / tctx->coeffPos, nCoeff entries)
//      into the scrap buffer tctx->coeffBuf, either unscaled (transquant bypass) or
//      dequantized with the QP selected by cIdx,
//   2. turns them into a residual (bypass, transform-skip incl. RDPCM, or inverse transform),
//   3. adds the residual onto the image plane at (xT,yT),
//   4. zeroes the coeffBuf entries that were written, so the scrap buffer is clean again.
//
// pixel_t             - sample type of the image plane; for 1-byte samples bit_depth is fixed to 8
// xT,yT               - position of the TU in the frame (chroma adapted)
// x0,y0               - position of the CU in the frame (chroma adapted); not referenced in the body
// nT                  - block size; the scaling-list path handles 4, 8, 16 and 32 only
// cIdx                - colour component: 0 -> qPYPrime, 1 -> qPCbPrime, 2 -> qPCrPrime
// transform_skip_flag - selects the transform-skip path instead of the inverse transform
// intra               - only used to select the scaling-list matrix
// rdpcmMode           - 0 = off, 2 = vertical, any other non-zero value = horizontal
362 template <class pixel_t>
scale_coefficients_internal(thread_context * tctx,int xT,int yT,int x0,int y0,int nT,int cIdx,bool transform_skip_flag,bool intra,int rdpcmMode)363 void scale_coefficients_internal(thread_context* tctx,
364 int xT,int yT, // position of TU in frame (chroma adapted)
365 int x0,int y0, // position of CU in frame (chroma adapted)
366 int nT, int cIdx,
367 bool transform_skip_flag, bool intra, int rdpcmMode)
368 {
369 const seq_parameter_set& sps = tctx->img->get_sps();
370 const pic_parameter_set& pps = tctx->img->get_pps();
371
// QP for this colour component, as stored in the thread context.
372 int qP;
373 switch (cIdx) {
374 case 0: qP = tctx->qPYPrime; break;
375 case 1: qP = tctx->qPCbPrime; break;
376 case 2: qP = tctx->qPCrPrime; break;
377 default: qP = 0; assert(0); break; // should never happen
378 }
379
380 logtrace(LogTransform,"qP: %d\n",qP);
381
382
// 'coeff' aliases the scrap buffer tctx->coeffBuf, addressed with a stride of nT.
383 int16_t* coeff;
384 int coeffStride;
385
386 coeff = tctx->coeffBuf;
387 coeffStride = nT;
388
389
390
391
392
// 'pred' points into the image plane of component cIdx at (xT,yT); the residual is added onto it.
393 pixel_t* pred;
394 int stride;
395 pred = tctx->img->get_image_plane_at_pos_NEW<pixel_t>(cIdx, xT,yT);
396 stride = tctx->img->get_image_stride(cIdx);
397
398 // We explicitly include the case for sizeof(pixel_t)==1 so that the compiler
399 // can optimize away a lot of code for 8-bit pixels.
400 const int bit_depth = ((sizeof(pixel_t)==1) ? 8 : sps.get_bit_depth(cIdx));
401
// The prediction mode is read from the image at (xT,yT) instead of using the 'intra' parameter.
402 //assert(intra == (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA));
403 int cuPredModeIntra = (tctx->img->get_pred_mode(xT,yT)==MODE_INTRA);
404
// Coefficient rotation only applies to 4x4 intra blocks when the SPS range-extension flag is set.
405 bool rotateCoeffs = (sps.range_extension.transform_skip_rotation_enabled_flag &&
406 nT == 4 &&
407 cuPredModeIntra);
408
// --- transquant bypass: coefficients are copied without scaling ---
409 if (tctx->cu_transquant_bypass_flag) {
410
411 int32_t residual_buffer[32*32];
412
// Luma residuals go to tctx->residual_luma (read by cross_comp_pred()), others to scratch memory.
413 int32_t* residual;
414 if (cIdx==0) residual = tctx->residual_luma;
415 else residual = residual_buffer;
416
417
418 // TODO: we could fold the coefficient rotation into the coefficient expansion here:
419 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
420 int32_t currCoeff = tctx->coeffList[cIdx][i];
421 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
422 }
423
424 if (rotateCoeffs) {
425 tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
426 }
427
428 if (rdpcmMode) {
429 if (rdpcmMode==2)
430 tctx->decctx->acceleration.transform_bypass_rdpcm_v(residual, coeff, nT);
431 else
432 tctx->decctx->acceleration.transform_bypass_rdpcm_h(residual, coeff, nT);
433 }
434 else {
435 tctx->decctx->acceleration.transform_bypass(residual, coeff, nT);
436 }
437
438 if (cIdx != 0) {
439 if (tctx->ResScaleVal != 0) {
440 cross_comp_pred(tctx, residual, nT);
441 }
442 }
443
444 tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
445
446 if (rotateCoeffs) {
447 memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
448 }
449 }
450 else {
451 // (8.6.3)
452
453 int bdShift = (cIdx==0 ? sps.BitDepth_Y : sps.BitDepth_C) + Log2(nT) - 5;
454
455 logtrace(LogTransform,"bdShift=%d\n",bdShift);
456
457 logtrace(LogTransform,"dequant %d;%d cIdx=%d qp=%d\n",xT*(cIdx?2:1),yT*(cIdx?2:1),cIdx,qP);
458
459
460 // --- inverse quantization ---
461
// Flat scaling (no scaling lists): one common factor for all coefficients.
462 if (sps.scaling_list_enable_flag==0) {
463
464 //const int m_x_y = 16;
465 const int m_x_y = 1;
466 bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
467
468 const int offset = (1<<(bdShift-1));
469 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
470
471 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
472
473 // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
474 int32_t currCoeff = tctx->coeffList[cIdx][i];
475
476 //logtrace(LogTransform,"coefficient[%d] = %d\n",tctx->coeffPos[cIdx][i],
477 //tctx->coeffList[cIdx][i]);
478
479 currCoeff = Clip3(-32768,32767,
480 ( (currCoeff * fact + offset ) >> bdShift));
481
482 //logtrace(LogTransform," -> %d\n",currCoeff);
483
484 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
485 }
486 }
// Scaling lists: a per-position factor m_x_y is taken from the PPS scaling list.
487 else {
488 const int offset = (1<<(bdShift-1));
489
490 const uint8_t* sclist;
// matrixID starts at cIdx; for non-intra blocks it is advanced by 3 (nT<32) or by 1 (nT==32).
491 int matrixID = cIdx;
492 if (!intra) {
493 if (nT<32) { matrixID += 3; }
494 else { matrixID++; }
495 }
496
// NOTE(review): for an nT other than 4/8/16/32 the default case only asserts, so with
// assertions compiled out 'sclist' stays uninitialized and is dereferenced in the loop below.
497 switch (nT) {
498 case 4: sclist = &pps.scaling_list.ScalingFactor_Size0[matrixID][0][0]; break;
499 case 8: sclist = &pps.scaling_list.ScalingFactor_Size1[matrixID][0][0]; break;
500 case 16: sclist = &pps.scaling_list.ScalingFactor_Size2[matrixID][0][0]; break;
501 case 32: sclist = &pps.scaling_list.ScalingFactor_Size3[matrixID][0][0]; break;
502 default: assert(0);
503 }
504
505 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
506 int pos = tctx->coeffPos[cIdx][i];
507 int x = pos%nT;
508 int y = pos/nT;
509
510 const int m_x_y = sclist[x+y*nT];
511 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
512
// 64-bit intermediate here, because bdShift is not reduced by 4 on this path.
513 int64_t currCoeff = tctx->coeffList[cIdx][i];
514
515 currCoeff = Clip3(-32768,32767,
516 ( (currCoeff * fact + offset ) >> bdShift));
517
518 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = currCoeff;
519 }
520 }
521
522
523 // --- do transform or skip ---
524
525 logtrace(LogTransform,"coefficients OUT:\n");
526 for (int y=0;y<nT;y++) {
527 logtrace(LogTransform," ");
528 for (int x=0;x<nT;x++) {
529 logtrace(LogTransform,"*%3d ", coeff[x+y*coeffStride]);
530 }
531 logtrace(LogTransform,"*\n");
532 }
533
// bdShift2 is only used for the trace output below.
534 int bdShift2 = (cIdx==0) ? 20-sps.BitDepth_Y : 20-sps.BitDepth_C;
535
536 logtrace(LogTransform,"bdShift2=%d\n",bdShift2);
537
538 logtrace(LogSlice,"get_transform_skip_flag(%d,%d, cIdx=%d)=%d\n",xT,yT,cIdx,
539 transform_skip_flag);
540
541 if (transform_skip_flag) {
542
// extended_precision_processing_flag is hard-coded to 0 here, so the two expressions below
// reduce to bdShift = max(20-bit_depth, 0) and tsShift = 5 + Log2nTbS.
// Note that this 'bdShift' shadows the dequantization bdShift declared above.
543 int extended_precision_processing_flag = 0;
544 int Log2nTbS = Log2(nT);
545 int bdShift = libde265_max( 20 - bit_depth, extended_precision_processing_flag ? 11 : 0 );
546 int tsShift = (extended_precision_processing_flag ? libde265_min( 5, bdShift - 2 ) : 5 )
547 + Log2nTbS;
548
549 if (rotateCoeffs) {
550 tctx->decctx->acceleration.rotate_coefficients(coeff, nT);
551 }
552
553 int32_t residual_buffer[32*32];
554
555 int32_t* residual;
556 if (cIdx==0) residual = tctx->residual_luma;
557 else residual = residual_buffer;
558
559 if (rdpcmMode) {
560 /*
561 if (rdpcmMode==2)
562 tctx->decctx->acceleration.transform_skip_rdpcm_v(pred,coeff, Log2(nT), stride, bit_depth);
563 else
564 tctx->decctx->acceleration.transform_skip_rdpcm_h(pred,coeff, Log2(nT), stride, bit_depth);
565 */
566
567 if (rdpcmMode==2)
568 tctx->decctx->acceleration.rdpcm_v(residual, coeff,nT, tsShift,bdShift);
569 else
570 tctx->decctx->acceleration.rdpcm_h(residual, coeff,nT, tsShift,bdShift);
571 }
572 else {
573 //tctx->decctx->acceleration.transform_skip(pred, coeff, stride, bit_depth);
574
575 tctx->decctx->acceleration.transform_skip_residual(residual, coeff, nT, tsShift, bdShift);
576 }
577
578 if (cIdx != 0) {
579 if (tctx->ResScaleVal != 0) {
580 cross_comp_pred(tctx, residual, nT);
581 }
582 }
583
584 tctx->decctx->acceleration.add_residual(pred,stride, residual,nT, bit_depth);
585
586 if (rotateCoeffs) {
587 memset(coeff, 0, nT*nT*sizeof(int16_t)); // delete all, because we moved the coeffs around
588 }
589 }
590 else {
// trType 1 (the 4x4 DST routines) is used only for 4x4 luma blocks of intra CUs; otherwise 0.
591 int trType;
592
593 //if (nT==4 && cIdx==0 && tctx->img->get_pred_mode(xT,yT)==MODE_INTRA) {
594 if (nT==4 && cIdx==0 && cuPredModeIntra) {
595 trType=1;
596 }
597 else {
598 trType=0;
599 }
600
// RDPCM is only handled on the bypass and transform-skip paths above.
601 assert(rdpcmMode==0);
602
603
604 if (tctx->img->get_pps().range_extension.cross_component_prediction_enabled_flag) {
605 // cross-component-prediction: transform to residual buffer and add in a separate step
606
607 transform_coefficients_explicit(tctx, coeff, coeffStride, nT, trType,
608 pred, stride, bit_depth, cIdx);
609 }
610 else {
611 transform_coefficients(&tctx->decctx->acceleration, coeff, coeffStride, nT, trType,
612 pred, stride, bit_depth);
613 }
614 }
615 }
616
617
618 logtrace(LogTransform,"pixels (cIdx:%d), position %d %d:\n",cIdx, xT,yT);
619
620 for (int y=0;y<nT;y++) {
621 logtrace(LogTransform,"RECO-%3d-%3d-%d ",xT,yT+y,cIdx);
622
623 for (int x=0;x<nT;x++) {
624 logtrace(LogTransform,"*%03x ", pred[x+y*stride]);
625 }
626
627 logtrace(LogTransform,"*\n");
628 }
629
630 // zero out scrap coefficient buffer again
// Only the positions listed in coeffPos are cleared; the rotated cases above have already
// cleared the whole nT*nT block with memset.
631
632 for (int i=0;i<tctx->nCoeff[cIdx];i++) {
633 tctx->coeffBuf[ tctx->coeffPos[cIdx][i] ] = 0;
634 }
635 }
636
637
scale_coefficients(thread_context * tctx,int xT,int yT,int x0,int y0,int nT,int cIdx,bool transform_skip_flag,bool intra,int rdpcmMode)638 void scale_coefficients(thread_context* tctx,
639 int xT,int yT, // position of TU in frame (chroma adapted)
640 int x0,int y0, // position of CU in frame (chroma adapted)
641 int nT, int cIdx,
642 bool transform_skip_flag, bool intra,
643 int rdpcmMode // 0 - off, 1 - Horizontal, 2 - Vertical
644 )
645 {
646 if (tctx->img->high_bit_depth(cIdx)) {
647 scale_coefficients_internal<uint16_t>(tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
648 rdpcmMode);
649 } else {
650 scale_coefficients_internal<uint8_t> (tctx, xT,yT, x0,y0, nT,cIdx, transform_skip_flag, intra,
651 rdpcmMode);
652 }
653 }
654
655
// Constants for the forward quantizer quant_coefficients() below.
656 //#define QUANT_IQUANT_SHIFT 20 // Q(QP%6) * IQ(QP%6) = 2^20
657 #define QUANT_SHIFT 14 // Q(4) = 2^14
658 //#define SCALE_BITS 15 // Inherited from TMuC, presumably for fractional bit estimates in RDOQ
659 #define MAX_TR_DYNAMIC_RANGE 15 // Maximum transform dynamic range (excluding sign bit)
660
661
// Forward-quantization multipliers, indexed with (qp % 6) by quant_coefficients().
// Entry 4 is 16384 = 2^14, matching QUANT_SHIFT.
662 const static uint16_t g_quantScales[6] = {
663 26214,23302,20560,18396,16384,14564
664 };
665
quant_coefficients(int16_t * out_coeff,const int16_t * in_coeff,int log2TrSize,int qp,bool intra)666 void quant_coefficients(//encoder_context* ectx,
667 int16_t* out_coeff,
668 const int16_t* in_coeff,
669 int log2TrSize, int qp,
670 bool intra)
671 {
672 const int qpDiv6 = qp / 6;
673 const int qpMod6 = qp % 6;
674
675 //int uiLog2TrSize = xLog2( iWidth - 1);
676
677 int uiQ = g_quantScales[qpMod6];
678 int bitDepth = 8;
679 int transformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - log2TrSize; // Represents scaling through forward transform
680 int qBits = QUANT_SHIFT + qpDiv6 + transformShift;
681
682 /* TODO: originally, this was checking for intra slices, why not for intra mode ?
683 */
684 int rnd = (intra ? 171 : 85) << (qBits-9);
685
686 int x, y;
687 int uiAcSum = 0;
688
689 int nStride = (1<<log2TrSize);
690
691 for (y=0; y < (1<<log2TrSize) ; y++) {
692 for (x=0; x < (1<<log2TrSize) ; x++) {
693 int level;
694 int sign;
695 int blockPos = y * nStride + x;
696 level = in_coeff[blockPos];
697 //logtrace(LogTransform,"(%d,%d) %d -> ", x,y,level);
698 sign = (level < 0 ? -1: 1);
699
700 level = (abs_value(level) * uiQ + rnd ) >> qBits;
701 uiAcSum += level;
702 level *= sign;
703 out_coeff[blockPos] = Clip3(-32768, 32767, level);
704 //logtrace(LogTransform,"%d\n", out_coeff[blockPos]);
705 }
706 }
707 }
708
709
dequant_coefficients(int16_t * out_coeff,const int16_t * in_coeff,int log2TrSize,int qP)710 void dequant_coefficients(int16_t* out_coeff,
711 const int16_t* in_coeff,
712 int log2TrSize, int qP)
713 {
714 const int m_x_y = 1;
715 int bitDepth = 8;
716 int bdShift = bitDepth + log2TrSize - 5;
717 bdShift -= 4; // this is equivalent to having a m_x_y of 16 and we can use 32bit integers
718
719 const int offset = (1<<(bdShift-1));
720 const int fact = m_x_y * levelScale[qP%6] << (qP/6);
721
722 int blkSize = (1<<log2TrSize);
723 int nCoeff = (1<<(log2TrSize<<1));
724
725 for (int i=0;i<nCoeff;i++) {
726
727 // usually, this needs to be 64bit, but because we modify the shift above, we can use 16 bit
728 int32_t currCoeff = in_coeff[i];
729
730 //logtrace(LogTransform,"coefficient[%d] = %d\n",i,currCoeff);
731
732 currCoeff = Clip3(-32768,32767,
733 ( (currCoeff * fact + offset ) >> bdShift));
734
735 //logtrace(LogTransform," -> %d\n",currCoeff);
736
737 out_coeff[i] = currCoeff;
738 }
739 }
740