1 /*
2  * H.265 video codec.
3  * Copyright (c) 2013-2014 struktur AG, Dirk Farin <farin@struktur.de>
4  *
5  * This file is part of libde265.
6  *
7  * libde265 is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as
9  * published by the Free Software Foundation, either version 3 of
10  * the License, or (at your option) any later version.
11  *
12  * libde265 is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libde265.  If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 #include "sao.h"
22 #include "util.h"
23 
24 #include <stdlib.h>
25 #include <string.h>
26 
27 
28 template <class pixel_t>
apply_sao_internal(de265_image * img,int xCtb,int yCtb,const slice_segment_header * shdr,int cIdx,int nSW,int nSH,const pixel_t * in_img,int in_stride,pixel_t * out_img,int out_stride)29 void apply_sao_internal(de265_image* img, int xCtb,int yCtb,
30                         const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
31                         const pixel_t* in_img,  int in_stride,
32                         /* */ pixel_t* out_img, int out_stride)
33 {
34   const sao_info* saoinfo = img->get_sao_info(xCtb,yCtb);
35 
36   int SaoTypeIdx = (saoinfo->SaoTypeIdx >> (2*cIdx)) & 0x3;
37 
38   logtrace(LogSAO,"apply_sao CTB %d;%d cIdx:%d type=%d (%dx%d)\n",xCtb,yCtb,cIdx, SaoTypeIdx, nSW,nSH);
39 
40   if (SaoTypeIdx==0) {
41     return;
42   }
43 
44   const seq_parameter_set* sps = &img->get_sps();
45   const pic_parameter_set* pps = &img->get_pps();
46   const int bitDepth = (cIdx==0 ? sps->BitDepth_Y : sps->BitDepth_C);
47   const int maxPixelValue = (1<<bitDepth)-1;
48 
49   // top left position of CTB in pixels
50   const int xC = xCtb*nSW;
51   const int yC = yCtb*nSH;
52 
53   const int width  = img->get_width(cIdx);
54   const int height = img->get_height(cIdx);
55 
56   const int ctbSliceAddrRS = img->get_SliceHeader(xC,yC)->SliceAddrRS;
57 
58   const int picWidthInCtbs = sps->PicWidthInCtbsY;
59   const int chromashiftW = sps->get_chroma_shift_W(cIdx);
60   const int chromashiftH = sps->get_chroma_shift_H(cIdx);
61   const int ctbshiftW = sps->Log2CtbSizeY - chromashiftW;
62   const int ctbshiftH = sps->Log2CtbSizeY - chromashiftH;
63 
64 
65   for (int i=0;i<5;i++)
66     {
67       logtrace(LogSAO,"offset[%d] = %d\n", i, i==0 ? 0 : saoinfo->saoOffsetVal[cIdx][i-1]);
68     }
69 
70 
71   // actual size of CTB to be processed (can be smaller when partially outside of image)
72   const int ctbW = (xC+nSW>width)  ? width -xC : nSW;
73   const int ctbH = (yC+nSH>height) ? height-yC : nSH;
74 
75 
76   const bool extendedTests = img->get_CTB_has_pcm_or_cu_transquant_bypass(xCtb,yCtb);
77 
78   if (SaoTypeIdx==2) {
79     int hPos[2], vPos[2];
80     int vPosStride[2]; // vPos[] multiplied by image stride
81     int SaoEoClass = (saoinfo->SaoEoClass >> (2*cIdx)) & 0x3;
82 
83     switch (SaoEoClass) {
84     case 0: hPos[0]=-1; hPos[1]= 1; vPos[0]= 0; vPos[1]=0; break;
85     case 1: hPos[0]= 0; hPos[1]= 0; vPos[0]=-1; vPos[1]=1; break;
86     case 2: hPos[0]=-1; hPos[1]= 1; vPos[0]=-1; vPos[1]=1; break;
87     case 3: hPos[0]= 1; hPos[1]=-1; vPos[0]=-1; vPos[1]=1; break;
88     }
89 
90     vPosStride[0] = vPos[0] * in_stride;
91     vPosStride[1] = vPos[1] * in_stride;
92 
93     /* Reorder sao_info.saoOffsetVal[] array, so that we can index it
94        directly with the sum of the two pixel-difference signs. */
95     int8_t  saoOffsetVal[5]; // [2] unused
96     saoOffsetVal[0] = saoinfo->saoOffsetVal[cIdx][1-1];
97     saoOffsetVal[1] = saoinfo->saoOffsetVal[cIdx][2-1];
98     saoOffsetVal[2] = 0;
99     saoOffsetVal[3] = saoinfo->saoOffsetVal[cIdx][3-1];
100     saoOffsetVal[4] = saoinfo->saoOffsetVal[cIdx][4-1];
101 
102 
103     for (int j=0;j<ctbH;j++) {
104       const pixel_t* in_ptr  = &in_img [xC+(yC+j)*in_stride];
105       /* */ pixel_t* out_ptr = &out_img[xC+(yC+j)*out_stride];
106 
107       for (int i=0;i<ctbW;i++) {
108         int edgeIdx = -1;
109 
110         logtrace(LogSAO, "pos %d,%d\n",xC+i,yC+j);
111 
112         if ((extendedTests &&
113              (sps->pcm_loop_filter_disable_flag &&
114               img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH))) ||
115             img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
116           continue;
117         }
118 
119         // do the expensive test for boundaries only at the boundaries
120         bool testBoundary = (i==0 || j==0 || i==ctbW-1 || j==ctbH-1);
121 
122         if (testBoundary)
123           for (int k=0;k<2;k++) {
124             int xS = xC+i+hPos[k];
125             int yS = yC+j+vPos[k];
126 
127             if (xS<0 || yS<0 || xS>=width || yS>=height) {
128               edgeIdx=0;
129               break;
130             }
131 
132 
133             // This part seems inefficient with all the get_SliceHeaderIndex() calls,
134             // but removing this part (because the input was known to have only a single
135             // slice anyway) reduced computation time only by 1.3%.
136             // TODO: however, this may still be a big part of SAO itself.
137 
138             slice_segment_header* sliceHeader = img->get_SliceHeader(xS<<chromashiftW,
139                                                                      yS<<chromashiftH);
140             if (sliceHeader==NULL) { return; }
141 
142             int sliceAddrRS = sliceHeader->SliceAddrRS;
143             if (sliceAddrRS <  ctbSliceAddrRS &&
144                 img->get_SliceHeader((xC+i)<<chromashiftW,
145                                      (yC+j)<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
146               edgeIdx=0;
147               break;
148             }
149 
150             if (sliceAddrRS >  ctbSliceAddrRS &&
151                 img->get_SliceHeader(xS<<chromashiftW,
152                                      yS<<chromashiftH)->slice_loop_filter_across_slices_enabled_flag==0) {
153               edgeIdx=0;
154               break;
155             }
156 
157 
158             if (pps->loop_filter_across_tiles_enabled_flag==0 &&
159                 pps->TileIdRS[(xS>>ctbshiftW) + (yS>>ctbshiftH)*picWidthInCtbs] !=
160                 pps->TileIdRS[(xC>>ctbshiftW) + (yC>>ctbshiftH)*picWidthInCtbs]) {
161               edgeIdx=0;
162               break;
163             }
164           }
165 
166         if (edgeIdx != 0) {
167 
168           edgeIdx = ( Sign(in_ptr[i] - in_ptr[i+hPos[0]+vPosStride[0]]) +
169                       Sign(in_ptr[i] - in_ptr[i+hPos[1]+vPosStride[1]])   );
170 
171           if (1) { // edgeIdx != 0) {   // seems to be faster without this check (zero in offset table)
172             int offset = saoOffsetVal[edgeIdx+2];
173 
174             out_ptr[i] = Clip3(0,maxPixelValue,
175                                in_ptr[i] + offset);
176           }
177         }
178       }
179     }
180   }
181   else {
182     int bandShift = bitDepth-5;
183     int saoLeftClass = saoinfo->sao_band_position[cIdx];
184     logtrace(LogSAO,"saoLeftClass: %d\n",saoLeftClass);
185 
186     int bandTable[32];
187     memset(bandTable, 0, sizeof(int)*32);
188 
189     for (int k=0;k<4;k++) {
190       bandTable[ (k+saoLeftClass)&31 ] = k+1;
191     }
192 
193 
194     /* If PCM or transquant_bypass is used in this CTB, we have to
195        run all checks (A).
196        Otherwise, we run a simplified version of the code (B).
197 
198        NOTE: this whole part of SAO does not seem to be a significant part of the time spent
199     */
200 
201     if (extendedTests) {
202 
203       // (A) full version with all checks
204 
205       for (int j=0;j<ctbH;j++)
206         for (int i=0;i<ctbW;i++) {
207 
208           if ((sps->pcm_loop_filter_disable_flag &&
209                img->get_pcm_flag((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) ||
210               img->get_cu_transquant_bypass((xC+i)<<chromashiftW,(yC+j)<<chromashiftH)) {
211             continue;
212           }
213 
214           // Shifts are a strange thing. On x86, >>x actually computes >>(x%64).
215           // So we have to take care of large bandShifts.
216           int bandIdx;
217           if (bandShift >= 8) {
218             bandIdx = 0;
219           } else {
220             bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
221           }
222 
223           if (bandIdx>0) {
224             int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
225 
226             logtrace(LogSAO,"%d %d (%d) offset %d  %x -> %x\n",xC+i,yC+j,bandIdx,
227                      offset,
228                      in_img[xC+i+(yC+j)*in_stride],
229                      in_img[xC+i+(yC+j)*in_stride]+offset);
230 
231             out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
232                                                     in_img[xC+i+(yC+j)*in_stride] + offset);
233           }
234         }
235     }
236     else
237       {
238         // (B) simplified version (only works if no PCM and transquant_bypass is active)
239 
240         for (int j=0;j<ctbH;j++)
241           for (int i=0;i<ctbW;i++) {
242 
243             // see above
244             int bandIdx;
245             if (bandShift >= 8) {
246               bandIdx = 0;
247             } else {
248               bandIdx = bandTable[ in_img[xC+i+(yC+j)*in_stride]>>bandShift ];
249             }
250 
251             if (bandIdx>0) {
252               int offset = saoinfo->saoOffsetVal[cIdx][bandIdx-1];
253 
254               out_img[xC+i+(yC+j)*out_stride] = Clip3(0,maxPixelValue,
255                                                       in_img[xC+i+(yC+j)*in_stride] + offset);
256             }
257           }
258       }
259   }
260 }
261 
262 
263 template <class pixel_t>
apply_sao(de265_image * img,int xCtb,int yCtb,const slice_segment_header * shdr,int cIdx,int nSW,int nSH,const pixel_t * in_img,int in_stride,pixel_t * out_img,int out_stride)264 void apply_sao(de265_image* img, int xCtb,int yCtb,
265                const slice_segment_header* shdr, int cIdx, int nSW,int nSH,
266                const pixel_t* in_img,  int in_stride,
267                /* */ pixel_t* out_img, int out_stride)
268 {
269   if (img->high_bit_depth(cIdx)) {
270     apply_sao_internal<uint16_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
271                                  (uint16_t*)in_img, in_stride,
272                                  (uint16_t*)out_img,out_stride);
273   }
274   else {
275     apply_sao_internal<uint8_t>(img,xCtb,yCtb, shdr,cIdx,nSW,nSH,
276                                 in_img, in_stride,
277                                 out_img,out_stride);
278   }
279 }
280 
281 
apply_sample_adaptive_offset(de265_image * img)282 void apply_sample_adaptive_offset(de265_image* img)
283 {
284   const seq_parameter_set& sps = img->get_sps();
285 
286   if (sps.sample_adaptive_offset_enabled_flag==0) {
287     return;
288   }
289 
290   de265_image inputCopy;
291   de265_error err = inputCopy.copy_image(img);
292   if (err != DE265_OK) {
293     img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
294     return;
295   }
296 
297   for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
298     for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
299       {
300         const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
301 
302         if (shdr->slice_sao_luma_flag) {
303           apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
304                     inputCopy.get_image_plane(0), inputCopy.get_image_stride(0),
305                     img->get_image_plane(0), img->get_image_stride(0));
306         }
307 
308         if (shdr->slice_sao_chroma_flag) {
309           int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
310           int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
311 
312           apply_sao(img, xCtb,yCtb, shdr, 1, nSW,nSH,
313                     inputCopy.get_image_plane(1), inputCopy.get_image_stride(1),
314                     img->get_image_plane(1), img->get_image_stride(1));
315 
316           apply_sao(img, xCtb,yCtb, shdr, 2, nSW,nSH,
317                     inputCopy.get_image_plane(2), inputCopy.get_image_stride(2),
318                     img->get_image_plane(2), img->get_image_stride(2));
319         }
320       }
321 }
322 
323 
apply_sample_adaptive_offset_sequential(de265_image * img)324 void apply_sample_adaptive_offset_sequential(de265_image* img)
325 {
326   const seq_parameter_set& sps = img->get_sps();
327 
328   if (sps.sample_adaptive_offset_enabled_flag==0) {
329     return;
330   }
331 
332   int lumaImageSize   = img->get_image_stride(0) * img->get_height(0) * img->get_bytes_per_pixel(0);
333   int chromaImageSize = img->get_image_stride(1) * img->get_height(1) * img->get_bytes_per_pixel(1);
334 
335   uint8_t* inputCopy = new uint8_t[ libde265_max(lumaImageSize, chromaImageSize) ];
336   if (inputCopy == NULL) {
337     img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
338     return;
339   }
340 
341 
342   int nChannels = 3;
343   if (sps.ChromaArrayType == CHROMA_MONO) { nChannels=1; }
344 
345   for (int cIdx=0;cIdx<nChannels;cIdx++) {
346 
347     int stride = img->get_image_stride(cIdx);
348     int height = img->get_height(cIdx);
349 
350     memcpy(inputCopy, img->get_image_plane(cIdx), stride * height * img->get_bytes_per_pixel(cIdx));
351 
352     for (int yCtb=0; yCtb<sps.PicHeightInCtbsY; yCtb++)
353       for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
354         {
355           const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,yCtb);
356           if (shdr==NULL) { return; }
357 
358           if (cIdx==0 && shdr->slice_sao_luma_flag) {
359             apply_sao(img, xCtb,yCtb, shdr, 0, 1<<sps.Log2CtbSizeY, 1<<sps.Log2CtbSizeY,
360                       inputCopy, stride,
361                       img->get_image_plane(0), img->get_image_stride(0));
362           }
363 
364           if (cIdx!=0 && shdr->slice_sao_chroma_flag) {
365             int nSW = (1<<sps.Log2CtbSizeY) / sps.SubWidthC;
366             int nSH = (1<<sps.Log2CtbSizeY) / sps.SubHeightC;
367 
368             apply_sao(img, xCtb,yCtb, shdr, cIdx, nSW,nSH,
369                       inputCopy, stride,
370                       img->get_image_plane(cIdx), img->get_image_stride(cIdx));
371           }
372         }
373   }
374 
375   delete[] inputCopy;
376 }
377 
378 
379 
380 
381 class thread_task_sao : public thread_task
382 {
383 public:
384   int  ctb_y;
385   de265_image* img; /* this is where we get the SPS from
386                        (either inputImg or outputImg can be a dummy image)
387                     */
388 
389   de265_image* inputImg;
390   de265_image* outputImg;
391   int inputProgress;
392 
393   virtual void work();
name() const394   virtual std::string name() const {
395     char buf[100];
396     sprintf(buf,"sao-%d",ctb_y);
397     return buf;
398   }
399 };
400 
401 
work()402 void thread_task_sao::work()
403 {
404   state = Running;
405   img->thread_run(this);
406 
407   const seq_parameter_set& sps = img->get_sps();
408 
409   const int rightCtb = sps.PicWidthInCtbsY-1;
410   const int ctbSize  = (1<<sps.Log2CtbSizeY);
411 
412 
413   // wait until also the CTB-rows below and above are ready
414 
415   img->wait_for_progress(this, rightCtb,ctb_y,  inputProgress);
416 
417   if (ctb_y>0) {
418     img->wait_for_progress(this, rightCtb,ctb_y-1, inputProgress);
419   }
420 
421   if (ctb_y+1<sps.PicHeightInCtbsY) {
422     img->wait_for_progress(this, rightCtb,ctb_y+1, inputProgress);
423   }
424 
425 
426   // copy input image to output for this CTB-row
427 
428   outputImg->copy_lines_from(inputImg, ctb_y * ctbSize, (ctb_y+1) * ctbSize);
429 
430 
431   // process SAO in the CTB-row
432 
433   for (int xCtb=0; xCtb<sps.PicWidthInCtbsY; xCtb++)
434     {
435       const slice_segment_header* shdr = img->get_SliceHeaderCtb(xCtb,ctb_y);
436       if (shdr==NULL) {
437         break;
438       }
439 
440       if (shdr->slice_sao_luma_flag) {
441         apply_sao(img, xCtb,ctb_y, shdr, 0, ctbSize, ctbSize,
442                   inputImg ->get_image_plane(0), inputImg ->get_image_stride(0),
443                   outputImg->get_image_plane(0), outputImg->get_image_stride(0));
444       }
445 
446       if (shdr->slice_sao_chroma_flag) {
447         int nSW = ctbSize / sps.SubWidthC;
448         int nSH = ctbSize / sps.SubHeightC;
449 
450         apply_sao(img, xCtb,ctb_y, shdr, 1, nSW,nSH,
451                   inputImg ->get_image_plane(1), inputImg ->get_image_stride(1),
452                   outputImg->get_image_plane(1), outputImg->get_image_stride(1));
453 
454         apply_sao(img, xCtb,ctb_y, shdr, 2, nSW,nSH,
455                   inputImg ->get_image_plane(2), inputImg ->get_image_stride(2),
456                   outputImg->get_image_plane(2), outputImg->get_image_stride(2));
457       }
458     }
459 
460 
461   // mark SAO progress
462 
463   for (int x=0;x<=rightCtb;x++) {
464     const int CtbWidth = sps.PicWidthInCtbsY;
465     img->ctb_progress[x+ctb_y*CtbWidth].set_progress(CTB_PROGRESS_SAO);
466   }
467 
468 
469   state = Finished;
470   img->thread_finishes(this);
471 }
472 
473 
add_sao_tasks(image_unit * imgunit,int saoInputProgress)474 bool add_sao_tasks(image_unit* imgunit, int saoInputProgress)
475 {
476   de265_image* img = imgunit->img;
477   const seq_parameter_set& sps = img->get_sps();
478 
479   if (sps.sample_adaptive_offset_enabled_flag==0) {
480     return false;
481   }
482 
483 
484   decoder_context* ctx = img->decctx;
485 
486   de265_error err = imgunit->sao_output.alloc_image(img->get_width(), img->get_height(),
487                                                     img->get_chroma_format(),
488                                                     img->get_shared_sps(),
489                                                     false,
490                                                     img->decctx, //img->encctx,
491                                                     img->pts, img->user_data, true);
492   if (err != DE265_OK) {
493     img->decctx->add_warning(DE265_WARNING_CANNOT_APPLY_SAO_OUT_OF_MEMORY,false);
494     return false;
495   }
496 
497   int nRows = sps.PicHeightInCtbsY;
498 
499   int n=0;
500   img->thread_start(nRows);
501 
502   for (int y=0;y<nRows;y++)
503     {
504       thread_task_sao* task = new thread_task_sao;
505 
506       task->inputImg  = img;
507       task->outputImg = &imgunit->sao_output;
508       task->img = img;
509       task->ctb_y = y;
510       task->inputProgress = saoInputProgress;
511 
512       imgunit->tasks.push_back(task);
513       add_task(&ctx->thread_pool_, task);
514       n++;
515     }
516 
517   /* Currently need barrier here because when are finished, we have to swap the pixel
518      data back into the main image. */
519   img->wait_for_completion();
520 
521   img->exchange_pixel_data_with(imgunit->sao_output);
522 
523   return true;
524 }
525