1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
5 * Steve Borho <steve@borho.org>
6 * Kavitha Sampas <kavitha@multicorewareinc.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
21 *
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at license @ x265.com.
24 *****************************************************************************/
25
26 #include "common.h"
27 #include "frame.h"
28 #include "picyuv.h"
29 #include "lowres.h"
30 #include "slice.h"
31 #include "mv.h"
32 #include "bitstream.h"
33
34 using namespace X265_NS;
35 namespace {
/* Values that stay constant for the whole weight analysis of one frame and
 * are handed to the helper routines in this file. */
struct Cache
{
    const int * intraCost;   /* lowres intra costs, read one entry per 8x8 luma block */
    int numPredDir;          /* 1 for P slices, otherwise 2 */
    int csp;                 /* chroma sampling format of the source picture */
    int hshift, vshift;      /* horizontal / vertical chroma shifts derived from csp */
    int lowresWidthInCU;     /* lowres width  >> 3 */
    int lowresHeightInCU;    /* lowres height >> 3 */
};
46
sliceHeaderCost(WeightParam * w,int lambda,int bChroma)47 int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
48 {
49 /* 4 times higher, because chroma is analyzed at full resolution. */
50 if (bChroma)
51 lambda *= 4;
52 int denomCost = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
53 return lambda * (10 + denomCost + 2 * (bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset)));
54 }
55
56 /* make a motion compensated copy of lowres ref into mcout with the same stride.
57 * The borders of mcout are not extended */
mcLuma(pixel * mcout,Lowres & ref,const MV * mvs)58 void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
59 {
60 intptr_t stride = ref.lumaStride;
61 const int mvshift = 1 << 2;
62 const int cuSize = 8;
63 MV mvmin, mvmax;
64
65 int cu = 0;
66
67 for (int y = 0; y < ref.lines; y += cuSize)
68 {
69 intptr_t pixoff = y * stride;
70 mvmin.y = (int16_t)((-y - 8) * mvshift);
71 mvmax.y = (int16_t)((ref.lines - y - 1 + 8) * mvshift);
72
73 for (int x = 0; x < ref.width; x += cuSize, pixoff += cuSize, cu++)
74 {
75 ALIGN_VAR_16(pixel, buf8x8[8 * 8]);
76 intptr_t bstride = 8;
77 mvmin.x = (int16_t)((-x - 8) * mvshift);
78 mvmax.x = (int16_t)((ref.width - x - 1 + 8) * mvshift);
79
80 /* clip MV to available pixels */
81 MV mv = mvs[cu];
82 mv = mv.clipped(mvmin, mvmax);
83 pixel *tmp = ref.lowresMC(pixoff, mv, buf8x8, bstride);
84 primitives.cu[BLOCK_8x8].copy_pp(mcout + pixoff, stride, tmp, bstride);
85 }
86 }
87 }
88
89 /* use lowres MVs from lookahead to generate a motion compensated chroma plane.
90 * if a block had cheaper lowres cost as intra, we treat it as MV 0 */
mcChroma(pixel * mcout,pixel * src,intptr_t stride,const MV * mvs,const Cache & cache,int height,int width)91 void mcChroma(pixel * mcout,
92 pixel * src,
93 intptr_t stride,
94 const MV * mvs,
95 const Cache& cache,
96 int height,
97 int width)
98 {
99 /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
100 * luma blocks. We have to adapt block size to chroma csp */
101 int csp = cache.csp;
102 int bw = 16 >> cache.hshift;
103 int bh = 16 >> cache.vshift;
104 const int mvshift = 1 << 2;
105 MV mvmin, mvmax;
106
107 for (int y = 0; y < height; y += bh)
108 {
109 /* note: lowres block count per row might be different from chroma block
110 * count per row because of rounding issues, so be very careful with indexing
111 * into the lowres structures */
112 int cu = y * cache.lowresWidthInCU;
113 intptr_t pixoff = y * stride;
114 mvmin.y = (int16_t)((-y - 8) * mvshift);
115 mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift);
116
117 for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
118 {
119 if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
120 {
121 MV mv = mvs[cu]; // lowres MV
122 mv <<= 1; // fullres MV
123 mv.x >>= cache.hshift;
124 mv.y >>= cache.vshift;
125
126 /* clip MV to available pixels */
127 mvmin.x = (int16_t)((-x - 8) * mvshift);
128 mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift);
129 mv = mv.clipped(mvmin, mvmax);
130
131 intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
132 pixel *temp = src + pixoff + fpeloffset;
133
134 int xFrac = mv.x & 0x7;
135 int yFrac = mv.y & 0x7;
136 if ((yFrac | xFrac) == 0)
137 {
138 primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
139 }
140 else if (yFrac == 0)
141 {
142 primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
143 }
144 else if (xFrac == 0)
145 {
146 primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
147 }
148 else
149 {
150 ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
151 primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
152 primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
153 }
154 }
155 else
156 {
157 primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
158 }
159 }
160 }
161 }
162
163 /* Measure sum of 8x8 satd costs between source frame and reference
164 * frame (potentially weighted, potentially motion compensated). We
165 * always use source images for this analysis since reference recon
166 * pixels have unreliable availability */
weightCost(pixel * fenc,pixel * ref,pixel * weightTemp,intptr_t stride,const Cache & cache,int width,int height,WeightParam * w,bool bLuma)167 uint32_t weightCost(pixel * fenc,
168 pixel * ref,
169 pixel * weightTemp,
170 intptr_t stride,
171 const Cache & cache,
172 int width,
173 int height,
174 WeightParam * w,
175 bool bLuma)
176 {
177 if (w)
178 {
179 /* make a weighted copy of the reference plane */
180 int offset = w->inputOffset << (X265_DEPTH - 8);
181 int weight = w->inputWeight;
182 int denom = w->log2WeightDenom;
183 int round = denom ? 1 << (denom - 1) : 0;
184 int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
185 int pwidth = ((width + 15) >> 4) << 4;
186
187 primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
188 weight, round << correction, denom + correction, offset);
189 ref = weightTemp;
190 }
191
192 uint32_t cost = 0;
193 pixel *f = fenc, *r = ref;
194
195 if (bLuma)
196 {
197 int cu = 0;
198 for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
199 {
200 for (int x = 0; x < width; x += 8, cu++)
201 {
202 int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
203 cost += X265_MIN(cmp, cache.intraCost[cu]);
204 }
205 }
206 }
207 else if (cache.csp == X265_CSP_I444)
208 for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
209 for (int x = 0; x < width; x += 16)
210 cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
211 else
212 for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
213 for (int x = 0; x < width; x += 8)
214 cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
215
216 return cost;
217 }
218 }
219
220 namespace X265_NS {
weightAnalyse(Slice & slice,Frame & frame,x265_param & param)221 void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
222 {
223 WeightParam wp[2][MAX_NUM_REF][3];
224 PicYuv *fencPic = frame.m_fencPic;
225 Lowres& fenc = frame.m_lowres;
226
227 Cache cache;
228
229 memset(&cache, 0, sizeof(cache));
230 cache.intraCost = fenc.intraCost;
231 cache.numPredDir = slice.isInterP() ? 1 : 2;
232 cache.lowresWidthInCU = fenc.width >> 3;
233 cache.lowresHeightInCU = fenc.lines >> 3;
234 cache.csp = fencPic->m_picCsp;
235 cache.hshift = CHROMA_H_SHIFT(cache.csp);
236 cache.vshift = CHROMA_V_SHIFT(cache.csp);
237
238 /* Use single allocation for motion compensated ref and weight buffers */
239 pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
240 if (!mcbuf)
241 {
242 slice.disableWeights();
243 return;
244 }
245 pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
246
247 int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
248 int curPoc = slice.m_poc;
249 const float epsilon = 1.f / 128.f;
250
251 int chromaDenom, lumaDenom, denom;
252 chromaDenom = lumaDenom = 7;
253 int numpixels[3];
254 int w16 = ((fencPic->m_picWidth + 15) >> 4) << 4;
255 int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
256 numpixels[0] = w16 * h16;
257 numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
258
259 for (int list = 0; list < cache.numPredDir; list++)
260 {
261 WeightParam *weights = wp[list][0];
262 Frame *refFrame = slice.m_refFrameList[list][0];
263 Lowres& refLowres = refFrame->m_lowres;
264 int diffPoc = abs(curPoc - refFrame->m_poc);
265
266 /* prepare estimates */
267 float guessScale[3], fencMean[3], refMean[3];
268 for (int plane = 0; plane < 3; plane++)
269 {
270 SET_WEIGHT(weights[plane], false, 1, 0, 0);
271 uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
272 uint64_t refVar = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
273 guessScale[plane] = sqrt((float)fencVar / refVar);
274 fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
275 refMean[plane] = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
276 }
277
278 /* make sure both our scale factors fit */
279 while (!list && chromaDenom > 0)
280 {
281 float thresh = 127.f / (1 << chromaDenom);
282 if (guessScale[1] < thresh && guessScale[2] < thresh)
283 break;
284 chromaDenom--;
285 }
286
287 SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
288 SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
289
290 MV *mvs = NULL;
291
292 for (int plane = 0; plane < 3; plane++)
293 {
294 denom = plane ? chromaDenom : lumaDenom;
295 if (plane && !weights[0].bPresentFlag)
296 break;
297
298 /* Early termination */
299 x265_emms();
300 if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
301 {
302 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
303 continue;
304 }
305
306 if (plane)
307 {
308 int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
309 if (scale > 127)
310 continue;
311 weights[plane].inputWeight = scale;
312 }
313 else
314 {
315 weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
316 }
317
318 int mindenom = weights[plane].log2WeightDenom;
319 int minscale = weights[plane].inputWeight;
320 int minoff = 0;
321
322 if (!plane && diffPoc <= param.bframes + 1)
323 {
324 mvs = fenc.lowresMvs[list][diffPoc - 1];
325
326 /* test whether this motion search was performed by lookahead */
327 if (mvs[0].x != 0x7FFF)
328 {
329 /* reference chroma planes must be extended prior to being
330 * used as motion compensation sources */
331 if (!refFrame->m_bChromaExtended)
332 {
333 refFrame->m_bChromaExtended = true;
334 PicYuv *refPic = refFrame->m_fencPic;
335 int width = refPic->m_picWidth >> cache.hshift;
336 int height = refPic->m_picHeight >> cache.vshift;
337 extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
338 extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
339 }
340 }
341 else
342 mvs = 0;
343 }
344
345 /* prepare inputs to weight analysis */
346 pixel *orig;
347 pixel *fref;
348 intptr_t stride;
349 int width, height;
350 switch (plane)
351 {
352 case 0:
353 orig = fenc.lowresPlane[0];
354 stride = fenc.lumaStride;
355 width = fenc.width;
356 height = fenc.lines;
357 fref = refLowres.lowresPlane[0];
358 if (mvs)
359 {
360 mcLuma(mcbuf, refLowres, mvs);
361 fref = mcbuf;
362 }
363 break;
364
365 case 1:
366 orig = fencPic->m_picOrg[1];
367 stride = fencPic->m_strideC;
368 fref = refFrame->m_fencPic->m_picOrg[1];
369
370 /* Clamp the chroma dimensions to the nearest multiple of
371 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
372 * blocks and weightCost measures 8x8 blocks. This
373 * potentially ignores some edge pixels, but simplifies the
374 * logic and prevents reading uninitialized pixels. Lowres
375 * planes are border extended and require no clamping. */
376 width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
377 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
378 if (mvs)
379 {
380 mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
381 fref = mcbuf;
382 }
383 break;
384
385 case 2:
386 orig = fencPic->m_picOrg[2];
387 stride = fencPic->m_strideC;
388 fref = refFrame->m_fencPic->m_picOrg[2];
389 width = ((fencPic->m_picWidth >> 4) << 4) >> cache.hshift;
390 height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
391 if (mvs)
392 {
393 mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
394 fref = mcbuf;
395 }
396 break;
397
398 default:
399 slice.disableWeights();
400 X265_FREE(mcbuf);
401 return;
402 }
403
404 uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
405 if (!origscore)
406 {
407 SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
408 continue;
409 }
410
411 uint32_t minscore = origscore;
412 bool bFound = false;
413
414 /* x264 uses a table lookup here, selecting search range based on preset */
415 static const int scaleDist = 4;
416 static const int offsetDist = 2;
417
418 int startScale = x265_clip3(0, 127, minscale - scaleDist);
419 int endScale = x265_clip3(0, 127, minscale + scaleDist);
420 for (int scale = startScale; scale <= endScale; scale++)
421 {
422 int deltaWeight = scale - (1 << mindenom);
423 if (deltaWeight > 127 || deltaWeight <= -128)
424 continue;
425
426 x265_emms();
427 int curScale = scale;
428 int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
429 if (curOffset < -128 || curOffset > 127)
430 {
431 /* Rescale considering the constraints on curOffset. We do it in this order
432 * because scale has a much wider range than offset (because of denom), so
433 * it should almost never need to be clamped. */
434 curOffset = x265_clip3(-128, 127, curOffset);
435 curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
436 curScale = x265_clip3(0, 127, curScale);
437 }
438
439 int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
440 int endOffset = x265_clip3(-128, 127, curOffset + offsetDist);
441 for (int off = startOffset; off <= endOffset; off++)
442 {
443 WeightParam wsp;
444 SET_WEIGHT(wsp, true, curScale, mindenom, off);
445 uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
446 sliceHeaderCost(&wsp, lambda, !!plane);
447 COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
448
449 /* Don't check any more offsets if the previous one had a lower cost than the current one */
450 if (minoff == startOffset && off != startOffset)
451 break;
452 }
453 }
454
455 /* Use a smaller luma denominator if possible */
456 if (!(plane || list))
457 {
458 while (mindenom > 0 && !(minscale & 1))
459 {
460 mindenom--;
461 minscale >>= 1;
462 }
463 }
464
465 if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
466 {
467 SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
468 }
469 else
470 {
471 SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
472 }
473 }
474
475 if (weights[0].bPresentFlag)
476 {
477 // Make sure both chroma channels match
478 if (weights[1].bPresentFlag != weights[2].bPresentFlag)
479 {
480 if (weights[1].bPresentFlag)
481 weights[2] = weights[1];
482 else
483 weights[1] = weights[2];
484 }
485 }
486
487 lumaDenom = weights[0].log2WeightDenom;
488 chromaDenom = weights[1].log2WeightDenom;
489
490 /* reset weight states */
491 for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
492 {
493 SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
494 SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
495 SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
496 }
497 }
498
499 X265_FREE(mcbuf);
500
501 memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
502
503 if (param.logLevel >= X265_LOG_FULL)
504 {
505 char buf[1024];
506 int p = 0;
507 bool bWeighted = false;
508
509 p = sprintf(buf, "poc: %d weights:", slice.m_poc);
510 int numPredDir = slice.isInterP() ? 1 : 2;
511 for (int list = 0; list < numPredDir; list++)
512 {
513 WeightParam* w = &wp[list][0][0];
514 if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
515 {
516 bWeighted = true;
517 p += sprintf(buf + p, " [L%d:R0 ", list);
518 if (w[0].bPresentFlag)
519 p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
520 if (w[1].bPresentFlag)
521 p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
522 if (w[2].bPresentFlag)
523 p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
524 p += sprintf(buf + p, "]");
525 }
526 }
527
528 if (bWeighted)
529 {
530 if (p < 80) // pad with spaces to ensure progress line overwritten
531 sprintf(buf + p, "%*s", 80 - p, " ");
532 x265_log(¶m, X265_LOG_FULL, "%s\n", buf);
533 }
534 }
535 }
536 }
537