1 /*
2 * H.265 video codec.
3 * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4 *
5 * This file is part of libde265.
6 *
7 * libde265 is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as
9 * published by the Free Software Foundation, either version 3 of
10 * the License, or (at your option) any later version.
11 *
12 * libde265 is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libde265. If not, see <http://www.gnu.org/licenses/>.
19 */
20
#include "sao.h"
#include "util.h"

#include <stdlib.h>
#include <string.h>

#include <new>
26
27
28 template <class pixel_t>
apply_sao_internal(de265_image * img,int xCtb,int yCtb,const slice_segment_header * shdr,int cIdx,int nSW,int nSH,const pixel_t * in_img,int in_stride,pixel_t * out_img,int out_stride)29 void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
30 const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
31 const pixel_t* in_img, int in_stride,
32 /* */ pixel_t* out_img, int out_stride)
33 {
34 const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35
36 int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37
38 logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39
40 if (SaoTypeIdx==0) {
41 return;
42 }
43
44 const seq_parameter_set* sps = &img->get_sps();
45 const pic_parameter_set* pps = &img->get_pps();
46 const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47 const int maxPixelValue = (1<<bitDepth)-1;
48
49 // top left position of CTB in pixels
50 const int xC = xCtb*nSW;
51 const int yC = yCtb*nSH;
52
53 const int width = img->get_width(cIdx);
54 const int height = img->get_height(cIdx);
55
56 const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57
58 const int picWidthInCtbs = sps->PicWidthInCtbsY;
59 const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60 const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61 const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62 const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63
64
65 for (int i=0;i<5;i++)
66 {
67 logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68 }
69
70
71 // actual size of CTB to be processed (can be smaller when partially outside of image)
72 const int ctbW = (xC+nSW>width) ? width -xC : nSW;
73 const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74
75
76 const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77
78 if (SaoTypeIdx==2) {
79 int hPos[2], vPos[2];
80 int vPosStride[2]; // vPos[] multiplied by image stride
81 int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82
83 switch (SaoEoClass) {
84 case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85 case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86 case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87 case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88 }
89
90 vPosStride[0] = vPos[0] * in_stride;
91 vPosStride[1] = vPos[1] * in_stride;
92
93 /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94 directly with the sum of the two pixel-difference signs. */
95 int8_t saoOffsetVal[5]; // [2] unused
96 saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97 saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98 saoOffsetVal[2] = 0;
99 saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100 saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101
102
103 for (int j=0;j<ctbH;j++) {
104 const pixel_t* in_ptr = &in_img [xC+(yC+j)*in_stride];
105 /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106
107 for (int i=0;i<ctbW;i++) {
108 int edgeIdx = -1;
109
110 logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111
112 if ((extendedTests &&
113 (sps->pcm_loop_filter_disable_flag &&
114 img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115 img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116 continue;
117 }
118
119 // do the expensive test for boundaries only at the boundaries
120 bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121
122 if (testBoundary)
123 for (int k=0;k<2;k++) {
124 int xS = xC+i+hPos[k];
125 int yS = yC+j+vPos[k];
126
127 if (xS<0 || yS<0 || xS>=width || yS>=height) {
128 edgeIdx=0;
129 break;
130 }
131
132
133 // This part seems inefficient with all the get_SliceHeaderIndex() calls,
134 // but removing this part (because the input was known to have only a single
135 // slice anyway) reduced computation time only by 1.3%.
136 // TODO: however, this may still be a big part of SAO itself.
137
138 slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139 yS<<chromashiftH);
140 if (sliceHeader==NULL) { return; }
141
142 int sliceAddrRS = sliceHeader->SliceAddrRS;
143 if (sliceAddrRS < ctbSliceAddrRS &&
144 img->get_SliceHeader((xC+i)<<chromashiftW,
145 (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146 edgeIdx=0;
147 break;
148 }
149
150 if (sliceAddrRS > ctbSliceAddrRS &&
151 img->get_SliceHeader(xS<<chromashiftW,
152 yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153 edgeIdx=0;
154 break;
155 }
156
157
158 if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159 pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160 pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161 edgeIdx=0;
162 break;
163 }
164 }
165
166 if (edgeIdx != 0) {
167
168 edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169 Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]]) );
170
171 if (1) { // edgeIdx != 0) { // seems to be faster without this check (zero in offset table)
172 int offset = saoOffsetVal[edgeIdx+2];
173
174 out_ptr[i] = Clip3(0,maxPixelValue,
175 in_ptr[i] + offset);
176 }
177 }
178 }
179 }
180 }
181 else {
182 int bandShift = bitDepth-5;
183 int saoLeftClass = saoinfo->sao_band_position[cIdx];
184 logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185
186 int bandTable[32];
187 memset(bandTable, 0, sizeof(int)*32);
188
189 for (int k=0;k<4;k++) {
190 bandTable[ (k+saoLeftClass)&31 ] = k+1;
191 }
192
193
194 /* If PCM or transquant_bypass is used in this CTB, we have to
195 run all checks (A).
196 Otherwise, we run a simplified version of the code (B).
197
198 NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199 */
200
201 if (extendedTests) {
202
203 // (A) full version with all checks
204
205 for (int j=0;j<ctbH;j++)
206 for (int i=0;i<ctbW;i++) {
207
208 if ((sps->pcm_loop_filter_disable_flag &&
209 img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210 img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211 continue;
212 }
213
214 // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215 // So we have to take care of large bandShifts.
216 int bandIdx;
217 if (bandShift >= 8) {
218 bandIdx = 0;
219 } else {
220 bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
221 }
222
223 if (bandIdx>0) {
224 int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
225
226 logtrace(LogSAO,"%d %d (%d) offset %d %x -> %x\n",xC+i,yC+j,bandIdx,
227 offset,
228 in_img[xC+i+(yC+j)*in_stride],
229 in_img[xC+i+(yC+j)*in_stride]+offset);
230
231 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
232 in_img[xC+i+(yC+j)*in_stride] + offset);
233 }
234 }
235 }
236 else
237 {
238 // (B) simplified version (only works if no PCM and transquant_bypass is active)
239
240 for (int j=0;j<ctbH;j++)
241 for (int i=0;i<ctbW;i++) {
242
243 // see above
244 int bandIdx;
245 if (bandShift >= 8) {
246 bandIdx = 0;
247 } else {
248 bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
249 }
250
251 if (bandIdx>0) {
252 int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
253
254 out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
255 in_img[xC+i+(yC+j)*in_stride] + offset);
256 }
257 }
258 }
259 }
260 }
261
262
263 template <class pixel_t>
apply_sao(de265_image * img,int xCtb,int yCtb,const slice_segment_header * shdr,int cIdx,int nSW,int nSH,const pixel_t * in_img,int in_stride,pixel_t * out_img,int out_stride)264 void apply_sao(de265_image* img, int xCtb,int yCtb,
265 const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
266 const pixel_t* in_img, int in_stride,
267 /* */ pixel_t* out_img, int out_stride)
268 {
269 if (img->high_bit_depth(cIdx)) {
270 apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
271 (uint16_t*)in_img, in_stride,
272 (uint16_t*)out_img,out_stride);
273 }
274 else {
275 apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
276 in_img, in_stride,
277 out_img,out_stride);
278 }
279 }
280
281
apply_sample_adaptive_offset(de265_image * img)282 void apply_sample_adaptive_offset(de265_image* img)
283 {
284 const seq_parameter_set& sps = img->get_sps();
285
286 if (sps.sample_adaptive_offset_enabled_flag==0) {
287 return;
288 }
289
290 de265_image inputCopy;
291 de265_error err = inputCopy.copy_image(img);
292 if (err != DE265_OK) {
293 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
294 return;
295 }
296
297 for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
298 for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
299 {
300 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
301
302 if (shdr->slice_sao_luma_flag) {
303 apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
304 inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
305 img->get_image_plane(0), img->get_image_stride(0));
306 }
307
308 if (shdr->slice_sao_chroma_flag) {
309 int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
310 int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
311
312 apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
313 inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
314 img->get_image_plane(1), img->get_image_stride(1));
315
316 apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
317 inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
318 img->get_image_plane(2), img->get_image_stride(2));
319 }
320 }
321 }
322
323
apply_sample_adaptive_offset_sequential(de265_image * img)324 void apply_sample_adaptive_offset_sequential(de265_image* img)
325 {
326 const seq_parameter_set& sps = img->get_sps();
327
328 if (sps.sample_adaptive_offset_enabled_flag==0) {
329 return;
330 }
331
332 int lumaImageSize = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0);
333 int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1);
334
335 uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ];
336 if (inputCopy == NULL) {
337 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
338 return;
339 }
340
341
342 int nChannels = 3;
343 if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }
344
345 for (int cIdx=0;cIdx<nChannels;cIdx++) {
346
347 int stride = img->get_image_stride(cIdx);
348 int height = img->get_height(cIdx);
349
350 memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx));
351
352 for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
353 for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
354 {
355 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
356 if (shdr==NULL) { return; }
357
358 if (cIdx==0 && shdr->slice_sao_luma_flag) {
359 apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
360 inputCopy, stride,
361 img->get_image_plane(0), img->get_image_stride(0));
362 }
363
364 if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
365 int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
366 int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
367
368 apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
369 inputCopy, stride,
370 img->get_image_plane(cIdx), img->get_image_stride(cIdx));
371 }
372 }
373 }
374
375 delete[] inputCopy;
376 }
377
378
379
380
381 class thread_task_sao : public thread_task
382 {
383 public:
384 int ctb_y;
385 de265_image* img; /* this is where we get the SPS from
386 (either inputImg or outputImg can be a dummy image)
387 */
388
389 de265_image* inputImg;
390 de265_image* outputImg;
391 int inputProgress;
392
393 virtual void work();
name() const394 virtual std::string name() const {
395 char buf[100];
396 sprintf(buf,"sao-%d",ctb_y);
397 return buf;
398 }
399 };
400
401
work()402 void thread_task_sao::work()
403 {
404 state = Running;
405 img->thread_run(this);
406
407 const seq_parameter_set& sps = img->get_sps();
408
409 const int rightCtb = sps.PicWidthInCtbsY-1;
410 const int ctbSize = (1<<sps.Log2CtbSizeY);
411
412
413 // wait until also the CTB-rows below and above are ready
414
415 img->wait_for_progress(this, rightCtb,ctb_y, inputProgress);
416
417 if (ctb_y>0) {
418 img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
419 }
420
421 if (ctb_y+1<sps.PicHeightInCtbsY) {
422 img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
423 }
424
425
426 // copy input image to output for this CTB-row
427
428 outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);
429
430
431 // process SAO in the CTB-row
432
433 for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
434 {
435 const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
436 if (shdr==NULL) {
437 break;
438 }
439
440 if (shdr->slice_sao_luma_flag) {
441 apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
442 inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
443 outputImg->get_image_plane(0), outputImg->get_image_stride(0));
444 }
445
446 if (shdr->slice_sao_chroma_flag) {
447 int nSW = ctbSize / sps.SubWidthC;
448 int nSH = ctbSize / sps.SubHeightC;
449
450 apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
451 inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
452 outputImg->get_image_plane(1), outputImg->get_image_stride(1));
453
454 apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
455 inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
456 outputImg->get_image_plane(2), outputImg->get_image_stride(2));
457 }
458 }
459
460
461 // mark SAO progress
462
463 for (int x=0;x<=rightCtb;x++) {
464 const int CtbWidth = sps.PicWidthInCtbsY;
465 img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
466 }
467
468
469 state = Finished;
470 img->thread_finishes(this);
471 }
472
473
add_sao_tasks(image_unit * imgunit,int saoInputProgress)474 bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
475 {
476 de265_image* img = imgunit->img;
477 const seq_parameter_set& sps = img->get_sps();
478
479 if (sps.sample_adaptive_offset_enabled_flag==0) {
480 return false;
481 }
482
483
484 decoder_context* ctx = img->decctx;
485
486 de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
487 img->get_chroma_format(),
488 img->get_shared_sps(),
489 false,
490 img->decctx, //img->encctx,
491 img->pts, img->user_data, true);
492 if (err != DE265_OK) {
493 img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
494 return false;
495 }
496
497 int nRows = sps.PicHeightInCtbsY;
498
499 int n=0;
500 img->thread_start(nRows);
501
502 for (int y=0;y<nRows;y++)
503 {
504 thread_task_sao* task = new thread_task_sao;
505
506 task->inputImg = img;
507 task->outputImg = &imgunit->sao_output;
508 task->img = img;
509 task->ctb_y = y;
510 task->inputProgress = saoInputProgress;
511
512 imgunit->tasks.push_back(task);
513 add_task(&ctx->thread_pool_, task);
514 n++;
515 }
516
517 /* Currently need barrier here because when are finished, we have to swap the pixel
518 data back into the main image. */
519 img->wait_for_completion();
520
521 img->exchange_pixel_data_with(imgunit->sao_output);
522
523 return true;
524 }
525