1 /*****************************************************************************
2  * Copyright (C) 2013 x265 project
3  *
4  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5  *         Steve Borho <steve@borho.org>
6  *         Kavitha Sampas <kavitha@multicorewareinc.com>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
21  *
22  * This program is also available under a commercial proprietary license.
23  * For more information, contact us at license @ x265.com.
24  *****************************************************************************/
25 
26 #include "common.h"
27 #include "frame.h"
28 #include "picyuv.h"
29 #include "lowres.h"
30 #include "slice.h"
31 #include "mv.h"
32 #include "bitstream.h"
33 
34 using namespace X265_NS;
35 namespace {
/* Analysis state that weightAnalyse() fills in once per slice and hands to the
 * motion compensation and cost helpers of this file. It is a plain aggregate
 * on purpose: the caller clears it with memset() before assigning fields. */
struct Cache
{
    const int* intraCost;          /* per-CU costs; weightCost() caps each luma block cost with them */
    int        numPredDir;         /* number of reference lists analysed: 1 for inter-P slices, otherwise 2 */
    int        csp;                /* colour space of the source picture, selects the chroma primitives */
    int        hshift;             /* CHROMA_H_SHIFT(csp): horizontal chroma subsampling shift */
    int        vshift;             /* CHROMA_V_SHIFT(csp): vertical chroma subsampling shift */
    int        lowresWidthInCU;    /* lowres width  >> 3 */
    int        lowresHeightInCU;   /* lowres height >> 3 */
};
46 
sliceHeaderCost(WeightParam * w,int lambda,int bChroma)47 int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
48 {
49     /* 4 times higher, because chroma is analyzed at full resolution. */
50     if (bChroma)
51         lambda *= 4;
52     int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
53     return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
54 }
55 
56 /* make a motion compensated copy of lowres ref into mcout with the same stride.
57  * The borders of mcout are not extended */
mcLuma(pixel * mcout,Lowres & ref,const MV * mvs)58 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
59 {
60     intptr_t stride = ref.lumaStride;
61     const int mvshift = 1 << 2;
62     const int cuSize = 8;
63     MV mvmin, mvmax;
64 
65     int cu = 0;
66 
67     for (int y = 0; y < ref.lines; y += cuSize)
68     {
69         intptr_t pixoff = y * stride;
70         mvmin.y = (int16_t)((-y - 8) * mvshift);
71         mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * mvshift);
72 
73         for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
74         {
75             ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
76             intptr_t bstride = 8;
77             mvmin.x = (int16_t)((-x - 8) * mvshift);
78             mvmax.x = (int16_t)((ref.width - x - 1 + 8) * mvshift);
79 
80             /* clip MV to available pixels */
81             MV mv = mvs[cu];
82             mv = mv.clipped(mvmin, mvmax);
83             pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
84             primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
85         }
86     }
87 }
88 
89 /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
90  * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
mcChroma(pixel * mcout,pixel * src,intptr_t stride,const MV * mvs,const Cache & cache,int height,int width)91 void mcChroma(pixel *      mcout,
92               pixel *      src,
93               intptr_t     stride,
94               const MV *   mvs,
95               const Cache& cache,
96               int          height,
97               int          width)
98 {
99     /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
100      * luma blocks. We have to adapt block size to chroma csp */
101     int csp = cache.csp;
102     int bw = 16 >> cache.hshift;
103     int bh = 16 >> cache.vshift;
104     const int mvshift = 1 << 2;
105     MV mvmin, mvmax;
106 
107     for (int y = 0; y < height; y += bh)
108     {
109         /* note: lowres block count per row might be different from chroma block
110          * count per row because of rounding issues, so be very careful with indexing
111          * into the lowres structures */
112         int cu = y * cache.lowresWidthInCU;
113         intptr_t pixoff = y * stride;
114         mvmin.y = (int16_t)((-y - 8) * mvshift);
115         mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift);
116 
117         for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
118         {
119             if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
120             {
121                 MV mv = mvs[cu]; // lowres MV
122                 mv <<= 1;        // fullres MV
123                 mv.x >>= cache.hshift;
124                 mv.y >>= cache.vshift;
125 
126                 /* clip MV to available pixels */
127                 mvmin.x = (int16_t)((-x - 8) * mvshift);
128                 mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift);
129                 mv = mv.clipped(mvmin, mvmax);
130 
131                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
132                 pixel *temp = src + pixoff + fpeloffset;
133 
134                 int xFrac = mv.x & 0x7;
135                 int yFrac = mv.y & 0x7;
136                 if ((yFrac | xFrac) == 0)
137                 {
138                     primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
139                 }
140                 else if (yFrac == 0)
141                 {
142                     primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
143                 }
144                 else if (xFrac == 0)
145                 {
146                     primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
147                 }
148                 else
149                 {
150                     ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
151                     primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
152                     primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
153                 }
154             }
155             else
156             {
157                 primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
158             }
159         }
160     }
161 }
162 
163 /* Measure sum of 8x8 satd costs between source frame and reference
164  * frame (potentially weighted, potentially motion compensated). We
165  * always use source images for this analysis since reference recon
166  * pixels have unreliable availability */
weightCost(pixel * fenc,pixel * ref,pixel * weightTemp,intptr_t stride,const Cache & cache,int width,int height,WeightParam * w,bool bLuma)167 uint32_t weightCost(pixel *         fenc,
168                     pixel *         ref,
169                     pixel *         weightTemp,
170                     intptr_t        stride,
171                     const Cache &   cache,
172                     int             width,
173                     int             height,
174                     WeightParam *   w,
175                     bool            bLuma)
176 {
177     if (w)
178     {
179         /* make a weighted copy of the reference plane */
180         int offset = w->inputOffset << (X265_DEPTH - 8);
181         int weight = w->inputWeight;
182         int denom = w->log2WeightDenom;
183         int round = denom ? 1 << (denom - 1) : 0;
184         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
185         int pwidth = ((width + 15) >> 4) << 4;
186 
187         primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
188                              weight, round << correction, denom + correction, offset);
189         ref = weightTemp;
190     }
191 
192     uint32_t cost = 0;
193     pixel *f = fenc, *r = ref;
194 
195     if (bLuma)
196     {
197         int cu = 0;
198         for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
199         {
200             for (int x = 0; x < width; x += 8, cu++)
201             {
202                 int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
203                 cost += X265_MIN(cmp, cache.intraCost[cu]);
204             }
205         }
206     }
207     else if (cache.csp == X265_CSP_I444)
208         for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
209             for (int x = 0; x < width; x += 16)
210                 cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
211     else
212         for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
213             for (int x = 0; x < width; x += 8)
214                 cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
215 
216     return cost;
217 }
218 }
219 
220 namespace X265_NS {
weightAnalyse(Slice & slice,Frame & frame,x265_param & param)221 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
222 {
223     WeightParam wp[2][MAX_NUM_REF][3];
224     PicYuv *fencPic = frame.m_fencPic;
225     Lowres& fenc    = frame.m_lowres;
226 
227     Cache cache;
228 
229     memset(&cache, 0, sizeof(cache));
230     cache.intraCost = fenc.intraCost;
231     cache.numPredDir = slice.isInterP() ? 1 : 2;
232     cache.lowresWidthInCU = fenc.width >> 3;
233     cache.lowresHeightInCU = fenc.lines >> 3;
234     cache.csp = fencPic->m_picCsp;
235     cache.hshift = CHROMA_H_SHIFT(cache.csp);
236     cache.vshift = CHROMA_V_SHIFT(cache.csp);
237 
238     /* Use single allocation for motion compensated ref and weight buffers */
239     pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
240     if (!mcbuf)
241     {
242         slice.disableWeights();
243         return;
244     }
245     pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
246 
247     int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
248     int curPoc = slice.m_poc;
249     const float epsilon = 1.f / 128.f;
250 
251     int chromaDenom, lumaDenom, denom;
252     chromaDenom = lumaDenom = 7;
253     int numpixels[3];
254     int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
255     int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
256     numpixels[0] = w16 * h16;
257     numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
258 
259     for (int list = 0; list < cache.numPredDir; list++)
260     {
261         WeightParam *weights = wp[list][0];
262         Frame *refFrame = slice.m_refFrameList[list][0];
263         Lowres& refLowres = refFrame->m_lowres;
264         int diffPoc = abs(curPoc - refFrame->m_poc);
265 
266         /* prepare estimates */
267         float guessScale[3], fencMean[3], refMean[3];
268         for (int plane = 0; plane < 3; plane++)
269         {
270             SET_WEIGHT(weights[plane], false, 1, 0, 0);
271             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
272             uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
273             guessScale[plane] = sqrt((float)fencVar / refVar);
274             fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
275             refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
276         }
277 
278         /* make sure both our scale factors fit */
279         while (!list && chromaDenom > 0)
280         {
281             float thresh = 127.f / (1 << chromaDenom);
282             if (guessScale[1] < thresh && guessScale[2] < thresh)
283                 break;
284             chromaDenom--;
285         }
286 
287         SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
288         SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
289 
290         MV *mvs = NULL;
291 
292         for (int plane = 0; plane < 3; plane++)
293         {
294             denom = plane ? chromaDenom : lumaDenom;
295             if (plane && !weights[0].bPresentFlag)
296                 break;
297 
298             /* Early termination */
299             x265_emms();
300             if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
301             {
302                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
303                 continue;
304             }
305 
306             if (plane)
307             {
308                 int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
309                 if (scale > 127)
310                     continue;
311                 weights[plane].inputWeight = scale;
312             }
313             else
314             {
315                 weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
316             }
317 
318             int mindenom = weights[plane].log2WeightDenom;
319             int minscale = weights[plane].inputWeight;
320             int minoff = 0;
321 
322             if (!plane && diffPoc <= param.bframes + 1)
323             {
324                 mvs = fenc.lowresMvs[list][diffPoc - 1];
325 
326                 /* test whether this motion search was performed by lookahead */
327                 if (mvs[0].x != 0x7FFF)
328                 {
329                     /* reference chroma planes must be extended prior to being
330                      * used as motion compensation sources */
331                     if (!refFrame->m_bChromaExtended)
332                     {
333                         refFrame->m_bChromaExtended = true;
334                         PicYuv *refPic = refFrame->m_fencPic;
335                         int width = refPic->m_picWidth >> cache.hshift;
336                         int height = refPic->m_picHeight >> cache.vshift;
337                         extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
338                         extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
339                     }
340                 }
341                 else
342                     mvs = 0;
343             }
344 
345             /* prepare inputs to weight analysis */
346             pixel *orig;
347             pixel *fref;
348             intptr_t stride;
349             int    width, height;
350             switch (plane)
351             {
352             case 0:
353                 orig = fenc.lowresPlane[0];
354                 stride = fenc.lumaStride;
355                 width = fenc.width;
356                 height = fenc.lines;
357                 fref = refLowres.lowresPlane[0];
358                 if (mvs)
359                 {
360                     mcLuma(mcbuf, refLowres, mvs);
361                     fref = mcbuf;
362                 }
363                 break;
364 
365             case 1:
366                 orig = fencPic->m_picOrg[1];
367                 stride = fencPic->m_strideC;
368                 fref = refFrame->m_fencPic->m_picOrg[1];
369 
370                 /* Clamp the chroma dimensions to the nearest multiple of
371                  * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
372                  * blocks and weightCost measures 8x8 blocks. This
373                  * potentially ignores some edge pixels, but simplifies the
374                  * logic and prevents reading uninitialized pixels. Lowres
375                  * planes are border extended and require no clamping. */
376                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
377                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
378                 if (mvs)
379                 {
380                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
381                     fref = mcbuf;
382                 }
383                 break;
384 
385             case 2:
386                 orig = fencPic->m_picOrg[2];
387                 stride = fencPic->m_strideC;
388                 fref = refFrame->m_fencPic->m_picOrg[2];
389                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
390                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
391                 if (mvs)
392                 {
393                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
394                     fref = mcbuf;
395                 }
396                 break;
397 
398             default:
399                 slice.disableWeights();
400                 X265_FREE(mcbuf);
401                 return;
402             }
403 
404             uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
405             if (!origscore)
406             {
407                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
408                 continue;
409             }
410 
411             uint32_t minscore = origscore;
412             bool bFound = false;
413 
414             /* x264 uses a table lookup here, selecting search range based on preset */
415             static const int scaleDist = 4;
416             static const int offsetDist = 2;
417 
418             int startScale = x265_clip3(0, 127, minscale - scaleDist);
419             int endScale   = x265_clip3(0, 127, minscale + scaleDist);
420             for (int scale = startScale; scale <= endScale; scale++)
421             {
422                 int deltaWeight = scale - (1 << mindenom);
423                 if (deltaWeight > 127 || deltaWeight <= -128)
424                     continue;
425 
426                 x265_emms();
427                 int curScale = scale;
428                 int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
429                 if (curOffset < -128 || curOffset > 127)
430                 {
431                     /* Rescale considering the constraints on curOffset. We do it in this order
432                      * because scale has a much wider range than offset (because of denom), so
433                      * it should almost never need to be clamped. */
434                     curOffset = x265_clip3(-128, 127, curOffset);
435                     curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
436                     curScale = x265_clip3(0, 127, curScale);
437                 }
438 
439                 int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
440                 int endOffset   = x265_clip3(-128, 127, curOffset + offsetDist);
441                 for (int off = startOffset; off <= endOffset; off++)
442                 {
443                     WeightParam wsp;
444                     SET_WEIGHT(wsp, true, curScale, mindenom, off);
445                     uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
446                                  sliceHeaderCost(&wsp, lambda, !!plane);
447                     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
448 
449                     /* Don't check any more offsets if the previous one had a lower cost than the current one */
450                     if (minoff == startOffset && off != startOffset)
451                         break;
452                 }
453             }
454 
455             /* Use a smaller luma denominator if possible */
456             if (!(plane || list))
457             {
458                 while (mindenom > 0 && !(minscale & 1))
459                 {
460                     mindenom--;
461                     minscale >>= 1;
462                 }
463             }
464 
465             if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
466             {
467                 SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
468             }
469             else
470             {
471                 SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
472             }
473         }
474 
475         if (weights[0].bPresentFlag)
476         {
477             // Make sure both chroma channels match
478             if (weights[1].bPresentFlag != weights[2].bPresentFlag)
479             {
480                 if (weights[1].bPresentFlag)
481                     weights[2] = weights[1];
482                 else
483                     weights[1] = weights[2];
484             }
485         }
486 
487         lumaDenom = weights[0].log2WeightDenom;
488         chromaDenom = weights[1].log2WeightDenom;
489 
490         /* reset weight states */
491         for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
492         {
493             SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
494             SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
495             SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
496         }
497     }
498 
499     X265_FREE(mcbuf);
500 
501     memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
502 
503     if (param.logLevel >= X265_LOG_FULL)
504     {
505         char buf[1024];
506         int p = 0;
507         bool bWeighted = false;
508 
509         p = sprintf(buf, "poc: %d weights:", slice.m_poc);
510         int numPredDir = slice.isInterP() ? 1 : 2;
511         for (int list = 0; list < numPredDir; list++)
512         {
513             WeightParam* w = &wp[list][0][0];
514             if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
515             {
516                 bWeighted = true;
517                 p += sprintf(buf + p, " [L%d:R0 ", list);
518                 if (w[0].bPresentFlag)
519                     p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
520                 if (w[1].bPresentFlag)
521                     p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
522                 if (w[2].bPresentFlag)
523                     p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
524                 p += sprintf(buf + p, "]");
525             }
526         }
527 
528         if (bWeighted)
529         {
530             if (p < 80) // pad with spaces to ensure progress line overwritten
531                 sprintf(buf + p, "%*s", 80 - p, " ");
532             x265_log(&param, X265_LOG_FULL, "%s\n", buf);
533         }
534     }
535 }
536 }
537