1 /*****************************************************************************
2  * Copyright (C) 2013-2020 MulticoreWare, Inc
3  *
4  * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5  *         Steve Borho <steve@borho.org>
6  *         Kavitha Sampas <kavitha@multicorewareinc.com>
7  *         Min Chen <chenm003@163.com>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22  *
23  * This program is also available under a commercial proprietary license.
24  * For more information, contact us at license @ x265.com.
25  *****************************************************************************/
26 
27 #include "common.h"
28 #include "frame.h"
29 #include "picyuv.h"
30 #include "lowres.h"
31 #include "slice.h"
32 #include "mv.h"
33 #include "bitstream.h"
34 #include "threading.h"
35 
36 using namespace X265_NS;
37 namespace {
/* Per-frame constants shared by the weight analysis helpers in this file */
struct Cache
{
    const int* intraCost;   /* per-block intra costs; weightCost caps each luma block cost with these */
    int numPredDir;         /* number of reference lists analysed: 1 for P slices, otherwise 2 */
    int csp;                /* internal color space, used to index primitives.chroma[] */
    int hshift, vshift;     /* horizontal / vertical chroma shifts derived from csp */
    int lowresWidthInCU, lowresHeightInCU; /* lowres frame dimensions >> 3 */
};
48 
sliceHeaderCost(WeightParam * w,int lambda,int bChroma)49 int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
50 {
51     /* 4 times higher, because chroma is analyzed at full resolution. */
52     if (bChroma)
53         lambda *= 4;
54     int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
55     return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
56 }
57 
58 /* make a motion compensated copy of lowres ref into mcout with the same stride.
59  * The borders of mcout are not extended */
mcLuma(pixel * mcout,Lowres & ref,const MV * mvs)60 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
61 {
62     intptr_t stride = ref.lumaStride;
63     const int mvshift = 1 << 2;
64     const int cuSize = 8;
65     MV mvmin, mvmax;
66 
67     int cu = 0;
68 
69     for (int y = 0; y < ref.lines; y += cuSize)
70     {
71         intptr_t pixoff = y * stride;
72         mvmin.y = (int32_t)((-y - 8) * mvshift);
73         mvmax.y = (int32_t)((ref.lines - y - 1 + 8) * mvshift);
74 
75         for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
76         {
77             ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
78             intptr_t bstride = 8;
79             mvmin.x = (int32_t)((-x - 8) * mvshift);
80             mvmax.x = (int32_t)((ref.width - x - 1 + 8) * mvshift);
81 
82             /* clip MV to available pixels */
83             MV mv = mvs[cu];
84             mv = mv.clipped(mvmin, mvmax);
85             pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride, 0);
86             primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
87         }
88     }
89 }
90 
91 /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
92  * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
mcChroma(pixel * mcout,pixel * src,intptr_t stride,const MV * mvs,const Cache & cache,int height,int width)93 void mcChroma(pixel *      mcout,
94               pixel *      src,
95               intptr_t     stride,
96               const MV *   mvs,
97               const Cache& cache,
98               int          height,
99               int          width)
100 {
101     /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
102      * luma blocks. We have to adapt block size to chroma csp */
103     int csp = cache.csp;
104     int bw = 16 >> cache.hshift;
105     int bh = 16 >> cache.vshift;
106     const int mvshift = 1 << 2;
107     MV mvmin, mvmax;
108 
109     for (int y = 0; y < height; y += bh)
110     {
111         /* note: lowres block count per row might be different from chroma block
112          * count per row because of rounding issues, so be very careful with indexing
113          * into the lowres structures */
114         int cu = y * cache.lowresWidthInCU;
115         intptr_t pixoff = y * stride;
116         mvmin.y = (int32_t)((-y - 8) * mvshift);
117         mvmax.y = (int32_t)((height - y - 1 + 8) * mvshift);
118 
119         for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
120         {
121             if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
122             {
123                 MV mv = mvs[cu]; // lowres MV
124                 mv <<= 1;        // fullres MV
125                 mv.x >>= cache.hshift;
126                 mv.y >>= cache.vshift;
127 
128                 /* clip MV to available pixels */
129                 mvmin.x = (int32_t)((-x - 8) * mvshift);
130                 mvmax.x = (int32_t)((width - x - 1 + 8) * mvshift);
131                 mv = mv.clipped(mvmin, mvmax);
132 
133                 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
134                 pixel *temp = src + pixoff + fpeloffset;
135 
136                 int xFrac = mv.x & 7;
137                 int yFrac = mv.y & 7;
138                 if (!(yFrac | xFrac))
139                 {
140                     primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
141                 }
142                 else if (!yFrac)
143                 {
144                     primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
145                 }
146                 else if (!xFrac)
147                 {
148                     primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
149                 }
150                 else
151                 {
152                     ALIGN_VAR_16(int16_t, immed[16 * (16 + NTAPS_CHROMA - 1)]);
153                     primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, immed, bw, xFrac, 1);
154                     primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(immed + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
155                 }
156             }
157             else
158             {
159                 primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
160             }
161         }
162     }
163 }
164 
165 /* Measure sum of 8x8 satd costs between source frame and reference
166  * frame (potentially weighted, potentially motion compensated). We
167  * always use source images for this analysis since reference recon
168  * pixels have unreliable availability */
weightCost(pixel * fenc,pixel * ref,pixel * weightTemp,intptr_t stride,const Cache & cache,int width,int height,WeightParam * w,bool bLuma)169 uint32_t weightCost(pixel *         fenc,
170                     pixel *         ref,
171                     pixel *         weightTemp,
172                     intptr_t        stride,
173                     const Cache &   cache,
174                     int             width,
175                     int             height,
176                     WeightParam *   w,
177                     bool            bLuma)
178 {
179     if (w)
180     {
181         /* make a weighted copy of the reference plane */
182         int offset = w->inputOffset << (X265_DEPTH - 8);
183         int weight = w->inputWeight;
184         int denom = w->log2WeightDenom;
185         int round = denom ? 1 << (denom - 1) : 0;
186         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
187         int pwidth = ((width + 31) >> 5) << 5;
188         primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
189                              weight, round << correction, denom + correction, offset);
190         ref = weightTemp;
191     }
192 
193     uint32_t cost = 0;
194     pixel *f = fenc, *r = ref;
195 
196     if (bLuma)
197     {
198         int cu = 0;
199         for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
200         {
201             for (int x = 0; x < width; x += 8, cu++)
202             {
203                 int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
204                 cost += X265_MIN(cmp, cache.intraCost[cu]);
205             }
206         }
207     }
208     else if (cache.csp == X265_CSP_I444)
209         for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
210             for (int x = 0; x < width; x += 16)
211                 cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
212     else
213         for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
214             for (int x = 0; x < width; x += 8)
215                 cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
216 
217     return cost;
218 }
219 }
220 
221 namespace X265_NS {
weightAnalyse(Slice & slice,Frame & frame,x265_param & param)222 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
223 {
224     WeightParam wp[2][MAX_NUM_REF][3];
225     PicYuv *fencPic = frame.m_fencPic;
226     Lowres& fenc    = frame.m_lowres;
227 
228     Cache cache;
229 
230     memset(&cache, 0, sizeof(cache));
231     cache.intraCost = fenc.intraCost;
232     cache.numPredDir = slice.isInterP() ? 1 : 2;
233     cache.lowresWidthInCU = fenc.width >> 3;
234     cache.lowresHeightInCU = fenc.lines >> 3;
235     cache.csp = param.internalCsp;
236     cache.hshift = CHROMA_H_SHIFT(cache.csp);
237     cache.vshift = CHROMA_V_SHIFT(cache.csp);
238 
239     /* Use single allocation for motion compensated ref and weight buffers */
240     pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
241     if (!mcbuf)
242     {
243         slice.disableWeights();
244         return;
245     }
246     pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
247 
248     int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
249     int curPoc = slice.m_poc;
250     const float epsilon = 1.f / 128.f;
251 
252     int chromaDenom, lumaDenom, denom;
253     chromaDenom = lumaDenom = 7;
254     int numpixels[3];
255     int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
256     int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
257     numpixels[0] = w16 * h16;
258     numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
259 
260     for (int list = 0; list < cache.numPredDir; list++)
261     {
262         WeightParam *weights = wp[list][0];
263         Frame *refFrame = slice.m_refFrameList[list][0];
264         Lowres& refLowres = refFrame->m_lowres;
265         int diffPoc = abs(curPoc - refFrame->m_poc);
266 
267         /* prepare estimates */
268         float guessScale[3], fencMean[3], refMean[3];
269         for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
270         {
271             SET_WEIGHT(weights[plane], false, 1, 0, 0);
272             uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
273             uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
274             guessScale[plane] = sqrt((float)fencVar / refVar);
275             fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
276             refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
277         }
278 
279         /* make sure both our scale factors fit */
280         while (!list && chromaDenom > 0)
281         {
282             float thresh = 127.f / (1 << chromaDenom);
283             if (guessScale[1] < thresh && guessScale[2] < thresh)
284                 break;
285             chromaDenom--;
286         }
287 
288         SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
289         SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
290 
291         MV *mvs = NULL;
292 
293         for (int plane = 0; plane < (param.internalCsp != X265_CSP_I400 ? 3 : 1); plane++)
294         {
295             denom = plane ? chromaDenom : lumaDenom;
296             if (plane && !weights[0].wtPresent)
297                 break;
298 
299             /* Early termination */
300             x265_emms();
301             if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
302             {
303                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
304                 continue;
305             }
306 
307             if (plane)
308             {
309                 int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
310                 if (scale > 127)
311                     continue;
312                 weights[plane].inputWeight = scale;
313             }
314             else
315             {
316                 weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
317             }
318 
319             int mindenom = weights[plane].log2WeightDenom;
320             int minscale = weights[plane].inputWeight;
321             int minoff = 0;
322 
323             if (!plane && diffPoc <= param.bframes + 1)
324             {
325                 mvs = fenc.lowresMvs[list][diffPoc];
326 
327                 /* test whether this motion search was performed by lookahead */
328                 if (mvs[0].x != 0x7FFF)
329                 {
330                     /* reference chroma planes must be extended prior to being
331                      * used as motion compensation sources */
332                     if (!refFrame->m_bChromaExtended && param.internalCsp != X265_CSP_I400 && frame.m_fencPic->m_picCsp != X265_CSP_I400)
333                     {
334                         refFrame->m_bChromaExtended = true;
335                         PicYuv *refPic = refFrame->m_fencPic;
336                         int width = refPic->m_picWidth >> cache.hshift;
337                         int height = refPic->m_picHeight >> cache.vshift;
338                         extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
339                         extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
340                     }
341                 }
342                 else
343                     mvs = 0;
344             }
345 
346             /* prepare inputs to weight analysis */
347             pixel *orig;
348             pixel *fref;
349             intptr_t stride;
350             int    width, height;
351             switch (plane)
352             {
353             case 0:
354                 orig = fenc.lowresPlane[0];
355                 stride = fenc.lumaStride;
356                 width = fenc.width;
357                 height = fenc.lines;
358                 fref = refLowres.lowresPlane[0];
359                 if (mvs)
360                 {
361                     mcLuma(mcbuf, refLowres, mvs);
362                     fref = mcbuf;
363                 }
364                 break;
365 
366             case 1:
367                 orig = fencPic->m_picOrg[1];
368                 stride = fencPic->m_strideC;
369                 fref = refFrame->m_fencPic->m_picOrg[1];
370 
371                 /* Clamp the chroma dimensions to the nearest multiple of
372                  * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
373                  * blocks and weightCost measures 8x8 blocks. This
374                  * potentially ignores some edge pixels, but simplifies the
375                  * logic and prevents reading uninitialized pixels. Lowres
376                  * planes are border extended and require no clamping. */
377                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
378                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
379                 if (mvs)
380                 {
381                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
382                     fref = mcbuf;
383                 }
384                 break;
385 
386             case 2:
387                 orig = fencPic->m_picOrg[2];
388                 stride = fencPic->m_strideC;
389                 fref = refFrame->m_fencPic->m_picOrg[2];
390                 width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
391                 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
392                 if (mvs)
393                 {
394                     mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
395                     fref = mcbuf;
396                 }
397                 break;
398 
399             default:
400                 slice.disableWeights();
401                 X265_FREE(mcbuf);
402                 return;
403             }
404 
405             uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
406             if (!origscore)
407             {
408                 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
409                 continue;
410             }
411 
412             uint32_t minscore = origscore;
413             bool bFound = false;
414 
415             /* x264 uses a table lookup here, selecting search range based on preset */
416             static const int scaleDist = 4;
417             static const int offsetDist = 2;
418 
419             int startScale = x265_clip3(0, 127, minscale - scaleDist);
420             int endScale   = x265_clip3(0, 127, minscale + scaleDist);
421             for (int scale = startScale; scale <= endScale; scale++)
422             {
423                 int deltaWeight = scale - (1 << mindenom);
424                 if (deltaWeight > 127 || deltaWeight <= -128)
425                     continue;
426 
427                 x265_emms();
428                 int curScale = scale;
429                 int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
430                 if (curOffset < -128 || curOffset > 127)
431                 {
432                     /* Rescale considering the constraints on curOffset. We do it in this order
433                      * because scale has a much wider range than offset (because of denom), so
434                      * it should almost never need to be clamped. */
435                     curOffset = x265_clip3(-128, 127, curOffset);
436                     curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
437                     curScale = x265_clip3(0, 127, curScale);
438                 }
439 
440                 int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
441                 int endOffset   = x265_clip3(-128, 127, curOffset + offsetDist);
442                 for (int off = startOffset; off <= endOffset; off++)
443                 {
444                     WeightParam wsp;
445                     SET_WEIGHT(wsp, true, curScale, mindenom, off);
446                     uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
447                                  sliceHeaderCost(&wsp, lambda, !!plane);
448                     COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
449 
450                     /* Don't check any more offsets if the previous one had a lower cost than the current one */
451                     if (minoff == startOffset && off != startOffset)
452                         break;
453                 }
454             }
455 
456             /* Use a smaller luma denominator if possible */
457             if (!(plane || list))
458             {
459                 if (mindenom > 0 && !(minscale & 1))
460                 {
461                     unsigned long idx;
462                     CTZ(idx, minscale);
463                     int shift = X265_MIN((int)idx, mindenom);
464                     mindenom -= shift;
465                     minscale >>= shift;
466                 }
467             }
468 
469             if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
470             {
471                 SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
472             }
473             else
474             {
475                 SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
476             }
477         }
478 
479         if (weights[0].wtPresent)
480         {
481             // Make sure both chroma channels match
482             if (weights[1].wtPresent != weights[2].wtPresent)
483             {
484                 if (weights[1].wtPresent)
485                     weights[2] = weights[1];
486                 else
487                     weights[1] = weights[2];
488             }
489         }
490 
491         lumaDenom = weights[0].log2WeightDenom;
492         chromaDenom = weights[1].log2WeightDenom;
493 
494         /* reset weight states */
495         for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
496         {
497             SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
498             SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
499             SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
500         }
501     }
502 
503     X265_FREE(mcbuf);
504 
505     memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
506 
507     if (param.logLevel >= X265_LOG_FULL)
508     {
509         char buf[1024];
510         int p = 0;
511         bool bWeighted = false;
512 
513         p = sprintf(buf, "poc: %d weights:", slice.m_poc);
514         int numPredDir = slice.isInterP() ? 1 : 2;
515         for (int list = 0; list < numPredDir; list++)
516         {
517             WeightParam* w = &wp[list][0][0];
518             if (w[0].wtPresent || w[1].wtPresent || w[2].wtPresent)
519             {
520                 bWeighted = true;
521                 p += sprintf(buf + p, " [L%d:R0 ", list);
522                 if (w[0].wtPresent)
523                     p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
524                 if (w[1].wtPresent)
525                     p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
526                 if (w[2].wtPresent)
527                     p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
528                 p += sprintf(buf + p, "]");
529             }
530         }
531 
532         if (bWeighted)
533         {
534             if (p < 80) // pad with spaces to ensure progress line overwritten
535                 sprintf(buf + p, "%*s", 80 - p, " ");
536             x265_log(&param, X265_LOG_FULL, "%s\n", buf);
537         }
538     }
539 }
540 }
541