1 /*****************************************************************************
2 * Copyright (C) 2013 x265 project
3 *
4 * Authors: Steve Borho <steve@borho.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
19 *
20 * This program is also available under a commercial proprietary license.
21 * For more information, contact us at license @ x265.com.
22 *****************************************************************************/
23
24 #include "common.h"
25 #include "primitives.h"
26 #include "lowres.h"
27 #include "motion.h"
28 #include "x265.h"
29
30 #if _MSC_VER
31 #pragma warning(disable: 4127) // conditional expression is constant (macros use this construct)
32 #endif
33
34 using namespace X265_NS;
35
namespace {

/* Subpel refinement effort for one --subme level: how many half-pel and
 * quarter-pel refinement iterations to run, how many candidate directions
 * to examine per iteration (4 or 8), and whether half-pel decisions are
 * made with SATD rather than SAD */
struct SubpelWorkload
{
    int hpel_iters;   // half-pel refinement iterations
    int hpel_dirs;    // directions tried per half-pel iteration
    int qpel_iters;   // quarter-pel refinement iterations
    int qpel_dirs;    // directions tried per quarter-pel iteration
    bool hpel_satd;   // use SATD (not SAD) for half-pel decisions
};

/* workload table indexed by subme level 0..X265_MAX_SUBPEL_LEVEL */
const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
{
    { 1, 4, 0, 4, false }, // 4 SAD HPEL only
    { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
    { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
    { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
    { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
    { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
    { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
};

/* per-partition scale factors, filled in by MotionEstimate::initScales() */
static int sizeScale[NUM_PU_SIZES];

/* true when bcost is under threshold v scaled to the current partition size;
 * used by the UMH search for early termination and adaptive range decisions */
#define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))

/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 }; /* (x-1)%6 */

/* center plus its eight neighbors; index 0 means "no movement" */
const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };

/* radius 4 hexagon used by the UMH hexagon-grid stage */
const MV hex4[16] =
{
    MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
    MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
    MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
    MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
};

/* two follow-up offsets per star-search direction 1..8 (see StarPatternSearch) */
const MV offsets[] =
{
    MV(-1, 0), MV(0, -1),
    MV(-1, -1), MV(1, -1),
    MV(-1, 0), MV(1, 0),
    MV(-1, 1), MV(-1, -1),
    MV(1, -1), MV(1, 1),
    MV(-1, 0), MV(0, 1),
    MV(-1, 1), MV(1, 1),
    MV(1, 0), MV(0, 1),
}; // offsets for Two Point Search

/* sum of absolute differences between MV candidates, used for adaptive ME range */
inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
{
    int sum = 0;

    /* accumulate L1 distance between each pair of consecutive candidates */
    for (int i = 0; i < numCandidates - 1; i++)
    {
        sum += abs(mvc[i].x - mvc[i + 1].x)
            + abs(mvc[i].y - mvc[i + 1].y);
    }

    return sum;
}

}
100
MotionEstimate()101 MotionEstimate::MotionEstimate()
102 {
103 ctuAddr = -1;
104 absPartIdx = -1;
105 searchMethod = X265_HEX_SEARCH;
106 subpelRefine = 2;
107 blockwidth = blockheight = 0;
108 blockOffset = 0;
109 bChromaSATD = false;
110 chromaSatd = NULL;
111 }
112
init(int method,int refine,int csp)113 void MotionEstimate::init(int method, int refine, int csp)
114 {
115 searchMethod = method;
116 subpelRefine = refine;
117 fencPUYuv.create(FENC_STRIDE, csp);
118 }
119
/* Populate the sizeScale[] table used by SAD_THRESH(): one scale factor per
 * luma partition size, so thresholds grow with block area.
 * NOTE(review): the scale is (H * H) >> 4 rather than (W * H) >> 4, so
 * non-square partitions are scaled by height alone -- confirm this is
 * intentional before changing. */
void MotionEstimate::initScales(void)
{
#define SETUP_SCALE(W, H) \
    sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
    SETUP_SCALE(4, 4);
    SETUP_SCALE(8, 8);
    SETUP_SCALE(8, 4);
    SETUP_SCALE(4, 8);
    SETUP_SCALE(16, 16);
    SETUP_SCALE(16, 8);
    SETUP_SCALE(8, 16);
    SETUP_SCALE(16, 12);
    SETUP_SCALE(12, 16);
    SETUP_SCALE(4, 16);
    SETUP_SCALE(16, 4);
    SETUP_SCALE(32, 32);
    SETUP_SCALE(32, 16);
    SETUP_SCALE(16, 32);
    SETUP_SCALE(32, 24);
    SETUP_SCALE(24, 32);
    SETUP_SCALE(32, 8);
    SETUP_SCALE(8, 32);
    SETUP_SCALE(64, 64);
    SETUP_SCALE(64, 32);
    SETUP_SCALE(32, 64);
    SETUP_SCALE(64, 48);
    SETUP_SCALE(48, 64);
    SETUP_SCALE(64, 16);
    SETUP_SCALE(16, 64);
#undef SETUP_SCALE
}
151
hpelIterationCount(int subme)152 int MotionEstimate::hpelIterationCount(int subme)
153 {
154 return workload[subme].hpel_iters +
155 workload[subme].qpel_iters / 2;
156 }
157
/* Release the cached source PU buffer allocated in init() */
MotionEstimate::~MotionEstimate()
{
    fencPUYuv.destroy();
}
162
163 /* Called by lookahead, luma only, no use of PicYuv */
void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
{
    /* select the partition-size specific cost primitives */
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
    sad = primitives.pu[partEnum].sad;
    satd = primitives.pu[partEnum].satd;
    sad_x3 = primitives.pu[partEnum].sad_x3;
    sad_x4 = primitives.pu[partEnum].sad_x4;

    blockwidth = pwidth;
    blockOffset = offset;
    absPartIdx = ctuAddr = -1; // lookahead path has no CTU/PU association

    /* copy PU block into cache */
    primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
    X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
}
181
182 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
{
    /* select the partition-size specific cost primitives */
    partEnum = partitionFromSizes(pwidth, pheight);
    X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
    sad = primitives.pu[partEnum].sad;
    satd = primitives.pu[partEnum].satd;
    sad_x3 = primitives.pu[partEnum].sad_x3;
    sad_x4 = primitives.pu[partEnum].sad_x4;
    chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;

    /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
     * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
    bChromaSATD = subpelRefine > 2 && chromaSatd;
    X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");

    ctuAddr = _ctuAddr;
    absPartIdx = cuPartIdx + puPartIdx;
    blockwidth = pwidth;
    blockOffset = 0; // recomputed from ctuAddr/absPartIdx in motionEstimate()

    /* copy PU from CU Yuv */
    fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
}
206
/* Evaluate full-pel candidate (mx,my): SAD plus qpel-scaled MV cost.
 * On improvement, record best cost/MV plus the star-search point number
 * and distance. (Comments are kept outside the macros: a '//' inside a
 * backslash-continued macro would swallow the continuation.) */
#define COST_MV_PT_DIST(mx, my, point, dist) \
    do \
    { \
        MV tmv(mx, my); \
        int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
        cost += mvcost(tmv << 2); \
        if (cost < bcost) { \
            bcost = cost; \
            bmv = tmv; \
            bPointNr = point; \
            bDistance = dist; \
        } \
    } while (0)

/* Evaluate a single full-pel candidate (mx,my) and keep it if cheaper */
#define COST_MV(mx, my) \
    do \
    { \
        int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
        cost += mvcost(MV(mx, my) << 2); \
        COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
    } while (0)

/* Measure three candidates relative to bmv in one sad_x3 call; results
 * (SAD + MV cost) are left in costs[0..2] for the caller to compare */
#define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x3(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
    }

/* Measure four absolute full-pel candidates in one sad_x4 call and update
 * best cost/MV/point/distance for each */
#define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
    { \
        sad_x4(fenc, \
               fref + (m0x) + (m0y) * stride, \
               fref + (m1x) + (m1y) * stride, \
               fref + (m2x) + (m2y) * stride, \
               fref + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
        (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
        (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
        (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
        COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
        COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
        COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
        COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
    }

/* Measure four candidates relative to search origin omv and update best
 * cost/MV for each */
#define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
    { \
        pixel *pix_base = fref + omv.x + omv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
        costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
        costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
        costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
        COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
        COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
        COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
        COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
    }

/* Measure four candidates relative to bmv in one sad_x4 call; results
 * (SAD + MV cost) are left in costs[0..3] for the caller to compare */
#define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
    { \
        pixel *pix_base = fref + bmv.x + bmv.y * stride; \
        sad_x4(fenc, \
               pix_base + (m0x) + (m0y) * stride, \
               pix_base + (m1x) + (m1y) * stride, \
               pix_base + (m2x) + (m2y) * stride, \
               pix_base + (m3x) + (m3y) * stride, \
               stride, costs); \
        (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
        (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
        (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
        (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
    }

/* One radius-1 diamond iteration centered at (mx,my) */
#define DIA1_ITER(mx, my) \
    { \
        omv.x = mx; omv.y = my; \
        COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
    }

/* Cross search: scan the horizontal then vertical axis through omv out to
 * x_max/y_max, using fast sad_x4 batches when the whole axis is within the
 * MV clip range, else bounds-checked single probes */
#define CROSS(start, x_max, y_max) \
    { \
        int16_t i = start; \
        if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
            for (; i < (x_max) - 2; i += 4) { \
                COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
        for (; i < (x_max); i += 2) \
        { \
            if (omv.x + i <= mvmax.x) \
                COST_MV(omv.x + i, omv.y); \
            if (omv.x - i >= mvmin.x) \
                COST_MV(omv.x - i, omv.y); \
        } \
        i = start; \
        if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
            for (; i < (y_max) - 2; i += 4) { \
                COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
        for (; i < (y_max); i += 2) \
        { \
            if (omv.y + i <= mvmax.y) \
                COST_MV(omv.x, omv.y + i); \
            if (omv.y - i >= mvmin.y) \
                COST_MV(omv.x, omv.y - i); \
        } \
    }
325
/* Star pattern integer search (adapted from HM). Probes star/diamond points
 * around bmv at distance 1, then 2/4/8, then 16..merange, always clipped to
 * [mvmin,mvmax]. In/out: bmv and bcost hold the search origin and its cost
 * on entry, the best full-pel MV and SAD cost on return. Out: bPointNr is
 * the direction (1-8, or 0 for the far rings) of the winning point and
 * bDistance its ring distance; callers use these to decide on two-point and
 * raster refinement. Gives up after earlyExitIters consecutive rings with
 * no improvement. */
void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
                                       const MV &       mvmin,
                                       const MV &       mvmax,
                                       MV &             bmv,
                                       int &            bcost,
                                       int &            bPointNr,
                                       int &            bDistance,
                                       int              earlyExitIters,
                                       int              merange)
{
    ALIGN_VAR_16(int, costs[16]);
    pixel* fenc = fencPUYuv.m_buf[0];
    pixel* fref = ref->fpelPlane[0] + blockOffset;
    intptr_t stride = ref->lumaStride;

    MV omv = bmv;      // fixed search origin; rings expand around it
    int saved = bcost; // cost before the current ring, for early-exit test
    int rounds = 0;    // consecutive rings without improvement

    /* first ring: distance 1 diamond */
    {
        int16_t dist = 1;

        /* bPointNr
              2
            4 * 5
              7
         */
        const int16_t top    = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left   = omv.x - dist;
        const int16_t right  = omv.x + dist;

        if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
        {
            /* all four points in range: one batched sad_x4 */
            COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
                               left,   omv.y,  4, dist,
                               right,  omv.y,  5, dist,
                               omv.x,  bottom, 7, dist);
        }
        else
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (left >= mvmin.x) // check middle left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check middle right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }
        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    /* rings at distance 2, 4, 8: eight-point star */
    for (int16_t dist = 2; dist <= 8; dist <<= 1)
    {
        /* bPointNr
              2
             1 3
            4 * 5
             6 8
              7
         Points 2, 4, 5, 7 are dist
         Points 1, 3, 6, 8 are dist>>1
         */
        const int16_t top     = omv.y - dist;
        const int16_t bottom  = omv.y + dist;
        const int16_t left    = omv.x - dist;
        const int16_t right   = omv.x + dist;
        const int16_t top2    = omv.y - (dist >> 1);
        const int16_t bottom2 = omv.y + (dist >> 1);
        const int16_t left2   = omv.x - (dist >> 1);
        const int16_t right2  = omv.x + (dist >> 1);
        saved = bcost;

        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            /* whole star in range: two batched sad_x4 calls */
            COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
                               left2,  top2,  1, dist >> 1,
                               right2, top2,  3, dist >> 1,
                               left,   omv.y, 4, dist);
            COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
                               left2,  bottom2, 6, dist >> 1,
                               right2, bottom2, 8, dist >> 1,
                               omv.x,  bottom,  7, dist);
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 2, dist);
            }
            if (top2 >= mvmin.y) // check half top
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
                }
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 4, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 5, dist);
            }
            if (bottom2 <= mvmax.y) // check half bottom
            {
                if (left2 >= mvmin.x) // check half left
                {
                    COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
                }
                if (right2 <= mvmax.x) // check half right
                {
                    COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
                }
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 7, dist);
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }

    /* far rings at distance 16, 32, ... merange: 16-point sampled diamond;
     * these report point number 0 (no direction for two-point refinement) */
    for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
    {
        const int16_t top    = omv.y - dist;
        const int16_t bottom = omv.y + dist;
        const int16_t left   = omv.x - dist;
        const int16_t right  = omv.x + dist;

        saved = bcost;
        if (top >= mvmin.y && left >= mvmin.x &&
            right <= mvmax.x && bottom <= mvmax.y) // check border
        {
            /* index
                  0
                  3
                  2
                  1
              0 3 2 1 * 1 2 3 0
                  1
                  2
                  3
                  0
            */

            COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
                               left,   omv.y,  0, dist,
                               right,  omv.y,  0, dist,
                               omv.x,  bottom, 0, dist);

            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x - ((dist >> 2) * index);
                int16_t posXR = omv.x + ((dist >> 2) * index);

                COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
                                   posXR, posYT, 0, dist,
                                   posXL, posYB, 0, dist,
                                   posXR, posYB, 0, dist);
            }
        }
        else // check border for each mv
        {
            if (top >= mvmin.y) // check top
            {
                COST_MV_PT_DIST(omv.x, top, 0, dist);
            }
            if (left >= mvmin.x) // check left
            {
                COST_MV_PT_DIST(left, omv.y, 0, dist);
            }
            if (right <= mvmax.x) // check right
            {
                COST_MV_PT_DIST(right, omv.y, 0, dist);
            }
            if (bottom <= mvmax.y) // check bottom
            {
                COST_MV_PT_DIST(omv.x, bottom, 0, dist);
            }
            for (int16_t index = 1; index < 4; index++)
            {
                int16_t posYT = top + ((dist >> 2) * index);
                int16_t posYB = bottom - ((dist >> 2) * index);
                int16_t posXL = omv.x - ((dist >> 2) * index);
                int16_t posXR = omv.x + ((dist >> 2) * index);

                if (posYT >= mvmin.y) // check top
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYT, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYT, 0, dist);
                    }
                }
                if (posYB <= mvmax.y) // check bottom
                {
                    if (posXL >= mvmin.x) // check left
                    {
                        COST_MV_PT_DIST(posXL, posYB, 0, dist);
                    }
                    if (posXR <= mvmax.x) // check right
                    {
                        COST_MV_PT_DIST(posXR, posYB, 0, dist);
                    }
                }
            }
        }

        if (bcost < saved)
            rounds = 0;
        else if (++rounds >= earlyExitIters)
            return;
    }
}
568
motionEstimate(ReferencePlanes * ref,const MV & mvmin,const MV & mvmax,const MV & qmvp,int numCandidates,const MV * mvc,int merange,MV & outQMv)569 int MotionEstimate::motionEstimate(ReferencePlanes *ref,
570 const MV & mvmin,
571 const MV & mvmax,
572 const MV & qmvp,
573 int numCandidates,
574 const MV * mvc,
575 int merange,
576 MV & outQMv)
577 {
578 ALIGN_VAR_16(int, costs[16]);
579 if (ctuAddr >= 0)
580 blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
581 intptr_t stride = ref->lumaStride;
582 pixel* fenc = fencPUYuv.m_buf[0];
583 pixel* fref = ref->fpelPlane[0] + blockOffset;
584
585 setMVP(qmvp);
586
587 MV qmvmin = mvmin.toQPel();
588 MV qmvmax = mvmax.toQPel();
589
590 /* The term cost used here means satd/sad values for that particular search.
591 * The costs used in ME integer search only includes the SAD cost of motion
592 * residual and sqrtLambda times MVD bits. The subpel refine steps use SATD
593 * cost of residual and sqrtLambda * MVD bits. Mode decision will be based
594 * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
595 * (mode + MVD bits). */
596
597 // measure SAD cost at clipped QPEL MVP
598 MV pmv = qmvp.clipped(qmvmin, qmvmax);
599 MV bestpre = pmv;
600 int bprecost;
601
602 if (ref->isLowres)
603 bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
604 else
605 bprecost = subpelCompare(ref, pmv, sad);
606
607 /* re-measure full pel rounded MVP with SAD as search start point */
608 MV bmv = pmv.roundToFPel();
609 int bcost = bprecost;
610 if (pmv.isSubpel())
611 bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
612
613 // measure SAD cost at MV(0) if MVP is not zero
614 if (pmv.notZero())
615 {
616 int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
617 if (cost < bcost)
618 {
619 bcost = cost;
620 bmv = 0;
621 }
622 }
623
624 X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
625 // measure SAD cost at each QPEL motion vector candidate
626 for (int i = 0; i < numCandidates; i++)
627 {
628 MV m = mvc[i].clipped(qmvmin, qmvmax);
629 if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
630 {
631 int cost = subpelCompare(ref, m, sad) + mvcost(m);
632 if (cost < bprecost)
633 {
634 bprecost = cost;
635 bestpre = m;
636 }
637 }
638 }
639
640 pmv = pmv.roundToFPel();
641 MV omv = bmv; // current search origin or starting point
642
643 switch (searchMethod)
644 {
645 case X265_DIA_SEARCH:
646 {
647 /* diamond search, radius 1 */
648 bcost <<= 4;
649 int i = merange;
650 do
651 {
652 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
653 COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
654 COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
655 COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
656 COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
657 if (!(bcost & 15))
658 break;
659 bmv.x -= (bcost << 28) >> 30;
660 bmv.y -= (bcost << 30) >> 30;
661 bcost &= ~15;
662 }
663 while (--i && bmv.checkRange(mvmin, mvmax));
664 bcost >>= 4;
665 break;
666 }
667
668 case X265_HEX_SEARCH:
669 {
670 me_hex2:
671 /* hexagon search, radius 2 */
672 #if 0
673 for (int i = 0; i < merange / 2; i++)
674 {
675 omv = bmv;
676 COST_MV(omv.x - 2, omv.y);
677 COST_MV(omv.x - 1, omv.y + 2);
678 COST_MV(omv.x + 1, omv.y + 2);
679 COST_MV(omv.x + 2, omv.y);
680 COST_MV(omv.x + 1, omv.y - 2);
681 COST_MV(omv.x - 1, omv.y - 2);
682 if (omv == bmv)
683 break;
684 if (!bmv.checkRange(mvmin, mvmax))
685 break;
686 }
687
688 #else // if 0
689 /* equivalent to the above, but eliminates duplicate candidates */
690 COST_MV_X3_DIR(-2, 0, -1, 2, 1, 2, costs);
691 bcost <<= 3;
692 COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
693 COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
694 COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
695 COST_MV_X3_DIR(2, 0, 1, -2, -1, -2, costs);
696 COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
697 COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
698 COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
699
700 if (bcost & 7)
701 {
702 int dir = (bcost & 7) - 2;
703 bmv += hex2[dir + 1];
704
705 /* half hexagon, not overlapping the previous iteration */
706 for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
707 {
708 COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
709 hex2[dir + 1].x, hex2[dir + 1].y,
710 hex2[dir + 2].x, hex2[dir + 2].y,
711 costs);
712 bcost &= ~7;
713 COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
714 COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
715 COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
716 if (!(bcost & 7))
717 break;
718 dir += (bcost & 7) - 2;
719 dir = mod6m1[dir + 1];
720 bmv += hex2[dir + 1];
721 }
722 }
723 bcost >>= 3;
724 #endif // if 0
725
726 /* square refine */
727 int dir = 0;
728 COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
729 COPY2_IF_LT(bcost, costs[0], dir, 1);
730 COPY2_IF_LT(bcost, costs[1], dir, 2);
731 COPY2_IF_LT(bcost, costs[2], dir, 3);
732 COPY2_IF_LT(bcost, costs[3], dir, 4);
733 COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
734 COPY2_IF_LT(bcost, costs[0], dir, 5);
735 COPY2_IF_LT(bcost, costs[1], dir, 6);
736 COPY2_IF_LT(bcost, costs[2], dir, 7);
737 COPY2_IF_LT(bcost, costs[3], dir, 8);
738 bmv += square1[dir];
739 break;
740 }
741
742 case X265_UMH_SEARCH:
743 {
744 int ucost1, ucost2;
745 int16_t cross_start = 1;
746
747 /* refine predictors */
748 omv = bmv;
749 ucost1 = bcost;
750 DIA1_ITER(pmv.x, pmv.y);
751 if (pmv.notZero())
752 DIA1_ITER(0, 0);
753
754 ucost2 = bcost;
755 if (bmv.notZero() && bmv != pmv)
756 DIA1_ITER(bmv.x, bmv.y);
757 if (bcost == ucost2)
758 cross_start = 3;
759
760 /* Early Termination */
761 omv = bmv;
762 if (bcost == ucost2 && SAD_THRESH(2000))
763 {
764 COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
765 COST_MV_X4(2, 0, -1, 1, 1, 1, 0, 2);
766 if (bcost == ucost1 && SAD_THRESH(500))
767 break;
768 if (bcost == ucost2)
769 {
770 int16_t range = (int16_t)(merange >> 1) | 1;
771 CROSS(3, range, range);
772 COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
773 COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
774 if (bcost == ucost2)
775 break;
776 cross_start = range + 2;
777 }
778 }
779
780 // TODO: Need to study x264's logic for building mvc list to understand why they
781 // have special cases here for 16x16, and whether they apply to HEVC CTU
782
783 // adaptive search range based on mvc variability
784 if (numCandidates)
785 {
786 /* range multipliers based on casual inspection of some statistics of
787 * average distance between current predictor and final mv found by ESA.
788 * these have not been tuned much by actual encoding. */
789 static const uint8_t range_mul[4][4] =
790 {
791 { 3, 3, 4, 4 },
792 { 3, 4, 4, 4 },
793 { 4, 4, 4, 5 },
794 { 4, 4, 5, 6 },
795 };
796
797 int mvd;
798 int sad_ctx, mvd_ctx;
799 int denom = 1;
800
801 if (numCandidates == 1)
802 {
803 if (LUMA_64x64 == partEnum)
804 /* mvc is probably the same as mvp, so the difference isn't meaningful.
805 * but prediction usually isn't too bad, so just use medium range */
806 mvd = 25;
807 else
808 mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
809 }
810 else
811 {
812 /* calculate the degree of agreement between predictors. */
813
814 /* in 64x64, mvc includes all the neighbors used to make mvp,
815 * so don't count mvp separately. */
816
817 denom = numCandidates - 1;
818 mvd = 0;
819 if (partEnum != LUMA_64x64)
820 {
821 mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
822 denom++;
823 }
824 mvd += predictorDifference(mvc, numCandidates);
825 }
826
827 sad_ctx = SAD_THRESH(1000) ? 0
828 : SAD_THRESH(2000) ? 1
829 : SAD_THRESH(4000) ? 2 : 3;
830 mvd_ctx = mvd < 10 * denom ? 0
831 : mvd < 20 * denom ? 1
832 : mvd < 40 * denom ? 2 : 3;
833
834 merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
835 }
836
837 /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
838 * we are still centered on the same place as the DIA2. is this desirable? */
839 CROSS(cross_start, merange, merange >> 1);
840 COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
841
842 /* hexagon grid */
843 omv = bmv;
844 const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
845 const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
846 uint16_t i = 1;
847 do
848 {
849 if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
850 mvmax.y - omv.y, omv.y - mvmin.y))
851 {
852 for (int j = 0; j < 16; j++)
853 {
854 MV mv = omv + (hex4[j] * i);
855 if (mv.checkRange(mvmin, mvmax))
856 COST_MV(mv.x, mv.y);
857 }
858 }
859 else
860 {
861 int16_t dir = 0;
862 pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
863 size_t dy = (size_t)i * stride;
864 #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
865 sad_x4(fenc, \
866 fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
867 fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
868 fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
869 fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
870 stride, costs + 4 * k); \
871 fref_base += 2 * dy;
872 #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
873 #define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
874
875 SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
876 SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
877 SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
878 SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
879 ADD_MVCOST(0, 0, -4);
880 ADD_MVCOST(1, 0, 4);
881 ADD_MVCOST(2, -2, -3);
882 ADD_MVCOST(3, 2, -3);
883 ADD_MVCOST(4, -4, -2);
884 ADD_MVCOST(5, 4, -2);
885 ADD_MVCOST(6, -4, -1);
886 ADD_MVCOST(7, 4, -1);
887 ADD_MVCOST(8, -4, 0);
888 ADD_MVCOST(9, 4, 0);
889 ADD_MVCOST(10, -4, 1);
890 ADD_MVCOST(11, 4, 1);
891 ADD_MVCOST(12, -4, 2);
892 ADD_MVCOST(13, 4, 2);
893 ADD_MVCOST(14, -2, 3);
894 ADD_MVCOST(15, 2, 3);
895 MIN_MV(0, 0, -4);
896 MIN_MV(1, 0, 4);
897 MIN_MV(2, -2, -3);
898 MIN_MV(3, 2, -3);
899 MIN_MV(4, -4, -2);
900 MIN_MV(5, 4, -2);
901 MIN_MV(6, -4, -1);
902 MIN_MV(7, 4, -1);
903 MIN_MV(8, -4, 0);
904 MIN_MV(9, 4, 0);
905 MIN_MV(10, -4, 1);
906 MIN_MV(11, 4, 1);
907 MIN_MV(12, -4, 2);
908 MIN_MV(13, 4, 2);
909 MIN_MV(14, -2, 3);
910 MIN_MV(15, 2, 3);
911 #undef SADS
912 #undef ADD_MVCOST
913 #undef MIN_MV
914 if (dir)
915 {
916 bmv.x = omv.x + i * (dir >> 4);
917 bmv.y = omv.y + i * ((dir << 28) >> 28);
918 }
919 }
920 }
921 while (++i <= merange >> 2);
922 if (bmv.checkRange(mvmin, mvmax))
923 goto me_hex2;
924 break;
925 }
926
927 case X265_STAR_SEARCH: // Adapted from HM ME
928 {
929 int bPointNr = 0;
930 int bDistance = 0;
931
932 const int EarlyExitIters = 3;
933 StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
934 if (bDistance == 1)
935 {
936 // if best distance was only 1, check two missing points. If no new point is found, stop
937 if (bPointNr)
938 {
939 /* For a given direction 1 to 8, check nearest two outer X pixels
940 X X
941 X 1 2 3 X
942 4 * 5
943 X 6 7 8 X
944 X X
945 */
946 int saved = bcost;
947 const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
948 const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
949 if (mv1.checkRange(mvmin, mvmax))
950 {
951 COST_MV(mv1.x, mv1.y);
952 }
953 if (mv2.checkRange(mvmin, mvmax))
954 {
955 COST_MV(mv2.x, mv2.y);
956 }
957 if (bcost == saved)
958 break;
959 }
960 else
961 break;
962 }
963
964 const int RasterDistance = 5;
965 if (bDistance > RasterDistance)
966 {
967 // raster search refinement if original search distance was too big
968 MV tmv;
969 for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
970 {
971 for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
972 {
973 if (tmv.x + (RasterDistance * 3) <= mvmax.x)
974 {
975 pixel *pix_base = fref + tmv.y * stride + tmv.x;
976 sad_x4(fenc,
977 pix_base,
978 pix_base + RasterDistance,
979 pix_base + RasterDistance * 2,
980 pix_base + RasterDistance * 3,
981 stride, costs);
982 costs[0] += mvcost(tmv << 2);
983 COPY2_IF_LT(bcost, costs[0], bmv, tmv);
984 tmv.x += RasterDistance;
985 costs[1] += mvcost(tmv << 2);
986 COPY2_IF_LT(bcost, costs[1], bmv, tmv);
987 tmv.x += RasterDistance;
988 costs[2] += mvcost(tmv << 2);
989 COPY2_IF_LT(bcost, costs[2], bmv, tmv);
990 tmv.x += RasterDistance;
991 costs[3] += mvcost(tmv << 3);
992 COPY2_IF_LT(bcost, costs[3], bmv, tmv);
993 }
994 else
995 COST_MV(tmv.x, tmv.y);
996 }
997 }
998 }
999
1000 while (bDistance > 0)
1001 {
1002 // center a new search around current best
1003 bDistance = 0;
1004 bPointNr = 0;
1005 const int MaxIters = 32;
1006 StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
1007
1008 if (bDistance == 1)
1009 {
1010 if (!bPointNr)
1011 break;
1012
1013 /* For a given direction 1 to 8, check nearest 2 outer X pixels
1014 X X
1015 X 1 2 3 X
1016 4 * 5
1017 X 6 7 8 X
1018 X X
1019 */
                /* Star-search refinement: probe the two offset points that
                 * flank the best outer point found so far (bPointNr is
                 * 1-based, hence the -1; offsets[] holds x/y deltas in pairs
                 * -- table is declared earlier in this function, outside this
                 * view). COST_MV updates bcost/bmv as a side effect. */
                const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
                const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
                if (mv1.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv1.x, mv1.y);
                }
                if (mv2.checkRange(mvmin, mvmax))
                {
                    COST_MV(mv2.x, mv2.y);
                }
                break;
            }
        }

        break;
    }

    case X265_FULL_SEARCH:
    {
        // dead slow exhaustive search, but at least it uses sad_x4()
        MV tmv;
        for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
        {
            for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
            {
                /* measure four horizontally adjacent candidates per call
                 * while at least four columns remain in the search window */
                if (tmv.x + 3 <= mvmax.x)
                {
                    pixel *pix_base = fref + tmv.y * stride + tmv.x;
                    sad_x4(fenc,
                           pix_base,
                           pix_base + 1,
                           pix_base + 2,
                           pix_base + 3,
                           stride, costs);
                    /* add the MV signaling cost (tmv is FPEL here, so shift
                     * up to QPEL units) and keep the cheapest candidate */
                    costs[0] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[0], bmv, tmv);
                    tmv.x++;
                    costs[1] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[1], bmv, tmv);
                    tmv.x++;
                    costs[2] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[2], bmv, tmv);
                    tmv.x++;
                    costs[3] += mvcost(tmv << 2);
                    COPY2_IF_LT(bcost, costs[3], bmv, tmv);
                }
                else
                    COST_MV(tmv.x, tmv.y);  // remainder columns, one at a time
            }
        }

        break;
    }

    default:
        X265_CHECK(0, "invalid motion estimate mode\n");
        break;
    }

    /* If one of the pre-search predictor candidates (measured before the
     * main search, outside this view) beat the search result, take it;
     * bestpre is already in QPEL units. Otherwise convert the FPEL search
     * result to QPEL for the subpel refinement below. */
    if (bprecost < bcost)
    {
        bmv = bestpre;
        bcost = bprecost;
    }
    else
        bmv = bmv.toQPel(); // promote search bmv to qpel

    const SubpelWorkload& wl = workload[this->subpelRefine];

    if (!bcost)
    {
        /* if there was zero residual at the clipped MVP, we can skip subpel
         * refine, but we do need to include the mvcost in the returned cost */
        bcost = mvcost(bmv);
    }
    else if (ref->isLowres)
    {
        /* Lowres (lookahead) path: a single HPEL pass then a single QPEL
         * pass, each trying the surrounding points from square1[] (index 0
         * is the center, so i starts at 1); HPEL offsets are QPEL units * 2.
         * HPEL uses SAD, the QPEL pass remeasures with SATD. */
        int bdir = 0;
        for (int i = 1; i <= wl.hpel_dirs; i++)
        {
            MV qmv = bmv + square1[i] * 2;
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        /* re-measure the chosen HPEL point with SATD so the QPEL costs
         * below are compared against a like-for-like baseline */
        bmv += square1[bdir] * 2;
        bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);

        bdir = 0;
        for (int i = 1; i <= wl.qpel_dirs; i++)
        {
            MV qmv = bmv + square1[i];
            int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
            COPY2_IF_LT(bcost, cost, bdir, i);
        }

        bmv += square1[bdir];
    }
    else
    {
        /* Full subpel refinement, effort controlled by the SubpelWorkload
         * table: iterative HPEL passes (SAD or SATD per wl.hpel_satd),
         * then iterative QPEL passes, always SATD. */
        pixelcmp_t hpelcomp;

        if (wl.hpel_satd)
        {
            /* rebase bcost with SATD so HPEL candidates compare fairly */
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
            hpelcomp = satd;
        }
        else
            hpelcomp = sad;

        for (int iter = 0; iter < wl.hpel_iters; iter++)
        {
            int bdir = 0;
            for (int i = 1; i <= wl.hpel_dirs; i++)
            {
                MV qmv = bmv + square1[i] * 2;
                int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir] * 2;
            else
                break;  // converged: center was best, stop iterating
        }

        /* if HPEL search used SAD, remeasure with SATD before QPEL */
        if (!wl.hpel_satd)
            bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);

        for (int iter = 0; iter < wl.qpel_iters; iter++)
        {
            int bdir = 0;
            for (int i = 1; i <= wl.qpel_dirs; i++)
            {
                MV qmv = bmv + square1[i];
                int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
                COPY2_IF_LT(bcost, cost, bdir, i);
            }

            if (bdir)
                bmv += square1[bdir];
            else
                break;  // converged at QPEL precision
        }
    }

    /* clear MMX state after any x86 SIMD primitives before returning */
    x265_emms();
    outQMv = bmv;
    return bcost;
}
1171
subpelCompare(ReferencePlanes * ref,const MV & qmv,pixelcmp_t cmp)1172 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
1173 {
1174 intptr_t refStride = ref->lumaStride;
1175 pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
1176 int xFrac = qmv.x & 0x3;
1177 int yFrac = qmv.y & 0x3;
1178 int cost;
1179 intptr_t lclStride = fencPUYuv.m_size;
1180 X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
1181
1182 if (!(yFrac | xFrac))
1183 cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
1184 else
1185 {
1186 /* we are taking a short-cut here if the reference is weighted. To be
1187 * accurate we should be interpolating unweighted pixels and weighting
1188 * the final 16bit values prior to rounding and down shifting. Instead we
1189 * are simply interpolating the weighted full-pel pixels. Not 100%
1190 * accurate but good enough for fast qpel ME */
1191 ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1192 if (!yFrac)
1193 primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
1194 else if (!xFrac)
1195 primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
1196 else
1197 primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
1198
1199 cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
1200 }
1201
1202 if (bChromaSATD)
1203 {
1204 int csp = fencPUYuv.m_csp;
1205 int hshift = fencPUYuv.m_hChromaShift;
1206 int vshift = fencPUYuv.m_vChromaShift;
1207 int shiftHor = (2 + hshift);
1208 int shiftVer = (2 + vshift);
1209 lclStride = fencPUYuv.m_csize;
1210
1211 intptr_t refStrideC = ref->reconPic->m_strideC;
1212 intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
1213
1214 const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
1215 const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
1216
1217 xFrac = qmv.x & ((1 << shiftHor) - 1);
1218 yFrac = qmv.y & ((1 << shiftVer) - 1);
1219
1220 if (!(yFrac | xFrac))
1221 {
1222 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
1223 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
1224 }
1225 else
1226 {
1227 ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1228 if (!yFrac)
1229 {
1230 primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1231 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1232
1233 primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1234 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1235 }
1236 else if (!xFrac)
1237 {
1238 primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1239 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1240
1241 primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1242 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1243 }
1244 else
1245 {
1246 ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
1247
1248 int extStride = blockwidth >> hshift;
1249 int filterSize = NTAPS_CHROMA;
1250 int halfFilterSize = (filterSize >> 1);
1251
1252 primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1253 primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1254 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1255
1256 primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1257 primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1258 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1259 }
1260 }
1261 }
1262
1263 return cost;
1264 }
1265