1 /*****************************************************************************
2  * Copyright (C) 2013 x265 project
3  *
4  * Authors: Steve Borho <steve@borho.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
19  *
20  * This program is also available under a commercial proprietary license.
21  * For more information, contact us at license @ x265.com.
22  *****************************************************************************/
23 
24 #include "common.h"
25 #include "primitives.h"
26 #include "lowres.h"
27 #include "motion.h"
28 #include "x265.h"
29 
30 #if _MSC_VER
31 #pragma warning(disable: 4127) // conditional  expression is constant (macros use this construct)
32 #endif
33 
34 using namespace X265_NS;
35 
36 namespace {
37 
38 struct SubpelWorkload
39 {
40     int hpel_iters;
41     int hpel_dirs;
42     int qpel_iters;
43     int qpel_dirs;
44     bool hpel_satd;
45 };
46 
47 const SubpelWorkload workload[X265_MAX_SUBPEL_LEVEL + 1] =
48 {
49     { 1, 4, 0, 4, false }, // 4 SAD HPEL only
50     { 1, 4, 1, 4, false }, // 4 SAD HPEL + 4 SATD QPEL
51     { 1, 4, 1, 4, true },  // 4 SATD HPEL + 4 SATD QPEL
52     { 2, 4, 1, 4, true },  // 2x4 SATD HPEL + 4 SATD QPEL
53     { 2, 4, 2, 4, true },  // 2x4 SATD HPEL + 2x4 SATD QPEL
54     { 1, 8, 1, 8, true },  // 8 SATD HPEL + 8 SATD QPEL (default)
55     { 2, 8, 1, 8, true },  // 2x8 SATD HPEL + 8 SATD QPEL
56     { 2, 8, 2, 8, true },  // 2x8 SATD HPEL + 2x8 SATD QPEL
57 };
58 
59 static int sizeScale[NUM_PU_SIZES];
60 #define SAD_THRESH(v) (bcost < (((v >> 4) * sizeScale[partEnum])))
61 
62 /* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
63 const MV hex2[8] = { MV(-1, -2), MV(-2, 0), MV(-1, 2), MV(1, 2), MV(2, 0), MV(1, -2), MV(-1, -2), MV(-2, 0) };
64 const uint8_t mod6m1[8] = { 5, 0, 1, 2, 3, 4, 5, 0 };  /* (x-1)%6 */
65 const MV square1[9] = { MV(0, 0), MV(0, -1), MV(0, 1), MV(-1, 0), MV(1, 0), MV(-1, -1), MV(-1, 1), MV(1, -1), MV(1, 1) };
66 const MV hex4[16] =
67 {
68     MV(0, -4), MV(0, 4), MV(-2, -3), MV(2, -3),
69     MV(-4, -2), MV(4, -2), MV(-4, -1), MV(4, -1),
70     MV(-4, 0), MV(4, 0), MV(-4, 1), MV(4, 1),
71     MV(-4, 2), MV(4, 2), MV(-2, 3), MV(2, 3),
72 };
73 const MV offsets[] =
74 {
75     MV(-1, 0), MV(0, -1),
76     MV(-1, -1), MV(1, -1),
77     MV(-1, 0), MV(1, 0),
78     MV(-1, 1), MV(-1, -1),
79     MV(1, -1), MV(1, 1),
80     MV(-1, 0), MV(0, 1),
81     MV(-1, 1), MV(1, 1),
82     MV(1, 0), MV(0, 1),
83 }; // offsets for Two Point Search
84 
85 /* sum of absolute differences between MV candidates, used for adaptive ME range */
predictorDifference(const MV * mvc,intptr_t numCandidates)86 inline int predictorDifference(const MV *mvc, intptr_t numCandidates)
87 {
88     int sum = 0;
89 
90     for (int i = 0; i < numCandidates - 1; i++)
91     {
92         sum += abs(mvc[i].x - mvc[i + 1].x)
93             +  abs(mvc[i].y - mvc[i + 1].y);
94     }
95 
96     return sum;
97 }
98 
99 }
100 
MotionEstimate()101 MotionEstimate::MotionEstimate()
102 {
103     ctuAddr = -1;
104     absPartIdx = -1;
105     searchMethod = X265_HEX_SEARCH;
106     subpelRefine = 2;
107     blockwidth = blockheight = 0;
108     blockOffset = 0;
109     bChromaSATD = false;
110     chromaSatd = NULL;
111 }
112 
init(int method,int refine,int csp)113 void MotionEstimate::init(int method, int refine, int csp)
114 {
115     searchMethod = method;
116     subpelRefine = refine;
117     fencPUYuv.create(FENC_STRIDE, csp);
118 }
119 
initScales(void)120 void MotionEstimate::initScales(void)
121 {
122 #define SETUP_SCALE(W, H) \
123     sizeScale[LUMA_ ## W ## x ## H] = (H * H) >> 4;
124     SETUP_SCALE(4, 4);
125     SETUP_SCALE(8, 8);
126     SETUP_SCALE(8, 4);
127     SETUP_SCALE(4, 8);
128     SETUP_SCALE(16, 16);
129     SETUP_SCALE(16, 8);
130     SETUP_SCALE(8, 16);
131     SETUP_SCALE(16, 12);
132     SETUP_SCALE(12, 16);
133     SETUP_SCALE(4, 16);
134     SETUP_SCALE(16, 4);
135     SETUP_SCALE(32, 32);
136     SETUP_SCALE(32, 16);
137     SETUP_SCALE(16, 32);
138     SETUP_SCALE(32, 24);
139     SETUP_SCALE(24, 32);
140     SETUP_SCALE(32, 8);
141     SETUP_SCALE(8, 32);
142     SETUP_SCALE(64, 64);
143     SETUP_SCALE(64, 32);
144     SETUP_SCALE(32, 64);
145     SETUP_SCALE(64, 48);
146     SETUP_SCALE(48, 64);
147     SETUP_SCALE(64, 16);
148     SETUP_SCALE(16, 64);
149 #undef SETUP_SCALE
150 }
151 
hpelIterationCount(int subme)152 int MotionEstimate::hpelIterationCount(int subme)
153 {
154     return workload[subme].hpel_iters +
155            workload[subme].qpel_iters / 2;
156 }
157 
~MotionEstimate()158 MotionEstimate::~MotionEstimate()
159 {
160     fencPUYuv.destroy();
161 }
162 
163 /* Called by lookahead, luma only, no use of PicYuv */
setSourcePU(pixel * fencY,intptr_t stride,intptr_t offset,int pwidth,int pheight)164 void MotionEstimate::setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight)
165 {
166     partEnum = partitionFromSizes(pwidth, pheight);
167     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
168     sad = primitives.pu[partEnum].sad;
169     satd = primitives.pu[partEnum].satd;
170     sad_x3 = primitives.pu[partEnum].sad_x3;
171     sad_x4 = primitives.pu[partEnum].sad_x4;
172 
173     blockwidth = pwidth;
174     blockOffset = offset;
175     absPartIdx = ctuAddr = -1;
176 
177     /* copy PU block into cache */
178     primitives.pu[partEnum].copy_pp(fencPUYuv.m_buf[0], FENC_STRIDE, fencY + offset, stride);
179     X265_CHECK(!bChromaSATD, "chroma distortion measurements impossible in this code path\n");
180 }
181 
182 /* Called by Search::predInterSearch() or --pme equivalent, chroma residual might be considered */
setSourcePU(const Yuv & srcFencYuv,int _ctuAddr,int cuPartIdx,int puPartIdx,int pwidth,int pheight)183 void MotionEstimate::setSourcePU(const Yuv& srcFencYuv, int _ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight)
184 {
185     partEnum = partitionFromSizes(pwidth, pheight);
186     X265_CHECK(LUMA_4x4 != partEnum, "4x4 inter partition detected!\n");
187     sad = primitives.pu[partEnum].sad;
188     satd = primitives.pu[partEnum].satd;
189     sad_x3 = primitives.pu[partEnum].sad_x3;
190     sad_x4 = primitives.pu[partEnum].sad_x4;
191     chromaSatd = primitives.chroma[fencPUYuv.m_csp].pu[partEnum].satd;
192 
193     /* Enable chroma residual cost if subpelRefine level is greater than 2 and chroma block size
194      * is an even multiple of 4x4 pixels (indicated by non-null chromaSatd pointer) */
195     bChromaSATD = subpelRefine > 2 && chromaSatd;
196     X265_CHECK(!(bChromaSATD && !workload[subpelRefine].hpel_satd), "Chroma SATD cannot be used with SAD hpel\n");
197 
198     ctuAddr = _ctuAddr;
199     absPartIdx = cuPartIdx + puPartIdx;
200     blockwidth = pwidth;
201     blockOffset = 0;
202 
203     /* copy PU from CU Yuv */
204     fencPUYuv.copyPUFromYuv(srcFencYuv, puPartIdx, partEnum, bChromaSATD);
205 }
206 
207 #define COST_MV_PT_DIST(mx, my, point, dist) \
208     do \
209     { \
210         MV tmv(mx, my); \
211         int cost = sad(fenc, FENC_STRIDE, fref + mx + my * stride, stride); \
212         cost += mvcost(tmv << 2); \
213         if (cost < bcost) { \
214             bcost = cost; \
215             bmv = tmv; \
216             bPointNr = point; \
217             bDistance = dist; \
218         } \
219     } while (0)
220 
221 #define COST_MV(mx, my) \
222     do \
223     { \
224         int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, stride); \
225         cost += mvcost(MV(mx, my) << 2); \
226         COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
227     } while (0)
228 
229 #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
230     { \
231         pixel *pix_base = fref + bmv.x + bmv.y * stride; \
232         sad_x3(fenc, \
233                pix_base + (m0x) + (m0y) * stride, \
234                pix_base + (m1x) + (m1y) * stride, \
235                pix_base + (m2x) + (m2y) * stride, \
236                stride, costs); \
237         (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
238         (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
239         (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
240     }
241 
242 #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, d2, m3x, m3y, p3, d3) \
243     { \
244         sad_x4(fenc, \
245                fref + (m0x) + (m0y) * stride, \
246                fref + (m1x) + (m1y) * stride, \
247                fref + (m2x) + (m2y) * stride, \
248                fref + (m3x) + (m3y) * stride, \
249                stride, costs); \
250         (costs)[0] += mvcost(MV(m0x, m0y) << 2); \
251         (costs)[1] += mvcost(MV(m1x, m1y) << 2); \
252         (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
253         (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
254         COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, bDistance, d0); \
255         COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, bDistance, d1); \
256         COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, bDistance, d2); \
257         COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, bDistance, d3); \
258     }
259 
260 #define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
261     { \
262         pixel *pix_base = fref + omv.x + omv.y * stride; \
263         sad_x4(fenc, \
264                pix_base + (m0x) + (m0y) * stride, \
265                pix_base + (m1x) + (m1y) * stride, \
266                pix_base + (m2x) + (m2y) * stride, \
267                pix_base + (m3x) + (m3y) * stride, \
268                stride, costs); \
269         costs[0] += mvcost((omv + MV(m0x, m0y)) << 2); \
270         costs[1] += mvcost((omv + MV(m1x, m1y)) << 2); \
271         costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
272         costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
273         COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
274         COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
275         COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
276         COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
277     }
278 
279 #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
280     { \
281         pixel *pix_base = fref + bmv.x + bmv.y * stride; \
282         sad_x4(fenc, \
283                pix_base + (m0x) + (m0y) * stride, \
284                pix_base + (m1x) + (m1y) * stride, \
285                pix_base + (m2x) + (m2y) * stride, \
286                pix_base + (m3x) + (m3y) * stride, \
287                stride, costs); \
288         (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
289         (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
290         (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
291         (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
292     }
293 
294 #define DIA1_ITER(mx, my) \
295     { \
296         omv.x = mx; omv.y = my; \
297         COST_MV_X4(0, -1, 0, 1, -1, 0, 1, 0); \
298     }
299 
300 #define CROSS(start, x_max, y_max) \
301     { \
302         int16_t i = start; \
303         if ((x_max) <= X265_MIN(mvmax.x - omv.x, omv.x - mvmin.x)) \
304             for (; i < (x_max) - 2; i += 4) { \
305                 COST_MV_X4(i, 0, -i, 0, i + 2, 0, -i - 2, 0); } \
306         for (; i < (x_max); i += 2) \
307         { \
308             if (omv.x + i <= mvmax.x) \
309                 COST_MV(omv.x + i, omv.y); \
310             if (omv.x - i >= mvmin.x) \
311                 COST_MV(omv.x - i, omv.y); \
312         } \
313         i = start; \
314         if ((y_max) <= X265_MIN(mvmax.y - omv.y, omv.y - mvmin.y)) \
315             for (; i < (y_max) - 2; i += 4) { \
316                 COST_MV_X4(0, i, 0, -i, 0, i + 2, 0, -i - 2); } \
317         for (; i < (y_max); i += 2) \
318         { \
319             if (omv.y + i <= mvmax.y) \
320                 COST_MV(omv.x, omv.y + i); \
321             if (omv.y - i >= mvmin.y) \
322                 COST_MV(omv.x, omv.y - i); \
323         } \
324     }
325 
StarPatternSearch(ReferencePlanes * ref,const MV & mvmin,const MV & mvmax,MV & bmv,int & bcost,int & bPointNr,int & bDistance,int earlyExitIters,int merange)326 void MotionEstimate::StarPatternSearch(ReferencePlanes *ref,
327                                        const MV &       mvmin,
328                                        const MV &       mvmax,
329                                        MV &             bmv,
330                                        int &            bcost,
331                                        int &            bPointNr,
332                                        int &            bDistance,
333                                        int              earlyExitIters,
334                                        int              merange)
335 {
336     ALIGN_VAR_16(int, costs[16]);
337     pixel* fenc = fencPUYuv.m_buf[0];
338     pixel* fref = ref->fpelPlane[0] + blockOffset;
339     intptr_t stride = ref->lumaStride;
340 
341     MV omv = bmv;
342     int saved = bcost;
343     int rounds = 0;
344 
345     {
346         int16_t dist = 1;
347 
348         /* bPointNr
349               2
350             4 * 5
351               7
352          */
353         const int16_t top    = omv.y - dist;
354         const int16_t bottom = omv.y + dist;
355         const int16_t left   = omv.x - dist;
356         const int16_t right  = omv.x + dist;
357 
358         if (top >= mvmin.y && left >= mvmin.x && right <= mvmax.x && bottom <= mvmax.y)
359         {
360             COST_MV_PT_DIST_X4(omv.x,  top,    2, dist,
361                                left,  omv.y,   4, dist,
362                                right, omv.y,   5, dist,
363                                omv.x,  bottom, 7, dist);
364         }
365         else
366         {
367             if (top >= mvmin.y) // check top
368             {
369                 COST_MV_PT_DIST(omv.x, top, 2, dist);
370             }
371             if (left >= mvmin.x) // check middle left
372             {
373                 COST_MV_PT_DIST(left, omv.y, 4, dist);
374             }
375             if (right <= mvmax.x) // check middle right
376             {
377                 COST_MV_PT_DIST(right, omv.y, 5, dist);
378             }
379             if (bottom <= mvmax.y) // check bottom
380             {
381                 COST_MV_PT_DIST(omv.x, bottom, 7, dist);
382             }
383         }
384         if (bcost < saved)
385             rounds = 0;
386         else if (++rounds >= earlyExitIters)
387             return;
388     }
389 
390     for (int16_t dist = 2; dist <= 8; dist <<= 1)
391     {
392         /* bPointNr
393               2
394              1 3
395             4 * 5
396              6 8
397               7
398          Points 2, 4, 5, 7 are dist
399          Points 1, 3, 6, 8 are dist>>1
400          */
401         const int16_t top     = omv.y - dist;
402         const int16_t bottom  = omv.y + dist;
403         const int16_t left    = omv.x - dist;
404         const int16_t right   = omv.x + dist;
405         const int16_t top2    = omv.y - (dist >> 1);
406         const int16_t bottom2 = omv.y + (dist >> 1);
407         const int16_t left2   = omv.x - (dist >> 1);
408         const int16_t right2  = omv.x + (dist >> 1);
409         saved = bcost;
410 
411         if (top >= mvmin.y && left >= mvmin.x &&
412             right <= mvmax.x && bottom <= mvmax.y) // check border
413         {
414             COST_MV_PT_DIST_X4(omv.x,  top,   2, dist,
415                                left2,  top2,  1, dist >> 1,
416                                right2, top2,  3, dist >> 1,
417                                left,   omv.y, 4, dist);
418             COST_MV_PT_DIST_X4(right,  omv.y,   5, dist,
419                                left2,  bottom2, 6, dist >> 1,
420                                right2, bottom2, 8, dist >> 1,
421                                omv.x,  bottom,  7, dist);
422         }
423         else // check border for each mv
424         {
425             if (top >= mvmin.y) // check top
426             {
427                 COST_MV_PT_DIST(omv.x, top, 2, dist);
428             }
429             if (top2 >= mvmin.y) // check half top
430             {
431                 if (left2 >= mvmin.x) // check half left
432                 {
433                     COST_MV_PT_DIST(left2, top2, 1, (dist >> 1));
434                 }
435                 if (right2 <= mvmax.x) // check half right
436                 {
437                     COST_MV_PT_DIST(right2, top2, 3, (dist >> 1));
438                 }
439             }
440             if (left >= mvmin.x) // check left
441             {
442                 COST_MV_PT_DIST(left, omv.y, 4, dist);
443             }
444             if (right <= mvmax.x) // check right
445             {
446                 COST_MV_PT_DIST(right, omv.y, 5, dist);
447             }
448             if (bottom2 <= mvmax.y) // check half bottom
449             {
450                 if (left2 >= mvmin.x) // check half left
451                 {
452                     COST_MV_PT_DIST(left2, bottom2, 6, (dist >> 1));
453                 }
454                 if (right2 <= mvmax.x) // check half right
455                 {
456                     COST_MV_PT_DIST(right2, bottom2, 8, (dist >> 1));
457                 }
458             }
459             if (bottom <= mvmax.y) // check bottom
460             {
461                 COST_MV_PT_DIST(omv.x, bottom, 7, dist);
462             }
463         }
464 
465         if (bcost < saved)
466             rounds = 0;
467         else if (++rounds >= earlyExitIters)
468             return;
469     }
470 
471     for (int16_t dist = 16; dist <= (int16_t)merange; dist <<= 1)
472     {
473         const int16_t top    = omv.y - dist;
474         const int16_t bottom = omv.y + dist;
475         const int16_t left   = omv.x - dist;
476         const int16_t right  = omv.x + dist;
477 
478         saved = bcost;
479         if (top >= mvmin.y && left >= mvmin.x &&
480             right <= mvmax.x && bottom <= mvmax.y) // check border
481         {
482             /* index
483                   0
484                   3
485                   2
486                   1
487           0 3 2 1 * 1 2 3 0
488                   1
489                   2
490                   3
491                   0
492             */
493 
494             COST_MV_PT_DIST_X4(omv.x,  top,    0, dist,
495                                left,   omv.y,  0, dist,
496                                right,  omv.y,  0, dist,
497                                omv.x,  bottom, 0, dist);
498 
499             for (int16_t index = 1; index < 4; index++)
500             {
501                 int16_t posYT = top    + ((dist >> 2) * index);
502                 int16_t posYB = bottom - ((dist >> 2) * index);
503                 int16_t posXL = omv.x  - ((dist >> 2) * index);
504                 int16_t posXR = omv.x  + ((dist >> 2) * index);
505 
506                 COST_MV_PT_DIST_X4(posXL, posYT, 0, dist,
507                                    posXR, posYT, 0, dist,
508                                    posXL, posYB, 0, dist,
509                                    posXR, posYB, 0, dist);
510             }
511         }
512         else // check border for each mv
513         {
514             if (top >= mvmin.y) // check top
515             {
516                 COST_MV_PT_DIST(omv.x, top, 0, dist);
517             }
518             if (left >= mvmin.x) // check left
519             {
520                 COST_MV_PT_DIST(left, omv.y, 0, dist);
521             }
522             if (right <= mvmax.x) // check right
523             {
524                 COST_MV_PT_DIST(right, omv.y, 0, dist);
525             }
526             if (bottom <= mvmax.y) // check bottom
527             {
528                 COST_MV_PT_DIST(omv.x, bottom, 0, dist);
529             }
530             for (int16_t index = 1; index < 4; index++)
531             {
532                 int16_t posYT = top    + ((dist >> 2) * index);
533                 int16_t posYB = bottom - ((dist >> 2) * index);
534                 int16_t posXL = omv.x - ((dist >> 2) * index);
535                 int16_t posXR = omv.x + ((dist >> 2) * index);
536 
537                 if (posYT >= mvmin.y) // check top
538                 {
539                     if (posXL >= mvmin.x) // check left
540                     {
541                         COST_MV_PT_DIST(posXL, posYT, 0, dist);
542                     }
543                     if (posXR <= mvmax.x) // check right
544                     {
545                         COST_MV_PT_DIST(posXR, posYT, 0, dist);
546                     }
547                 }
548                 if (posYB <= mvmax.y) // check bottom
549                 {
550                     if (posXL >= mvmin.x) // check left
551                     {
552                         COST_MV_PT_DIST(posXL, posYB, 0, dist);
553                     }
554                     if (posXR <= mvmax.x) // check right
555                     {
556                         COST_MV_PT_DIST(posXR, posYB, 0, dist);
557                     }
558                 }
559             }
560         }
561 
562         if (bcost < saved)
563             rounds = 0;
564         else if (++rounds >= earlyExitIters)
565             return;
566     }
567 }
568 
motionEstimate(ReferencePlanes * ref,const MV & mvmin,const MV & mvmax,const MV & qmvp,int numCandidates,const MV * mvc,int merange,MV & outQMv)569 int MotionEstimate::motionEstimate(ReferencePlanes *ref,
570                                    const MV &       mvmin,
571                                    const MV &       mvmax,
572                                    const MV &       qmvp,
573                                    int              numCandidates,
574                                    const MV *       mvc,
575                                    int              merange,
576                                    MV &             outQMv)
577 {
578     ALIGN_VAR_16(int, costs[16]);
579     if (ctuAddr >= 0)
580         blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);
581     intptr_t stride = ref->lumaStride;
582     pixel* fenc = fencPUYuv.m_buf[0];
583     pixel* fref = ref->fpelPlane[0] + blockOffset;
584 
585     setMVP(qmvp);
586 
587     MV qmvmin = mvmin.toQPel();
588     MV qmvmax = mvmax.toQPel();
589 
590     /* The term cost used here means satd/sad values for that particular search.
591      * The costs used in ME integer search only includes the SAD cost of motion
592      * residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD
593      * cost of residual and sqrtLambda * MVD bits.  Mode decision will be based
594      * on video distortion cost (SSE/PSNR) plus lambda times all signaling bits
595      * (mode + MVD bits). */
596 
597     // measure SAD cost at clipped QPEL MVP
598     MV pmv = qmvp.clipped(qmvmin, qmvmax);
599     MV bestpre = pmv;
600     int bprecost;
601 
602     if (ref->isLowres)
603         bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad);
604     else
605         bprecost = subpelCompare(ref, pmv, sad);
606 
607     /* re-measure full pel rounded MVP with SAD as search start point */
608     MV bmv = pmv.roundToFPel();
609     int bcost = bprecost;
610     if (pmv.isSubpel())
611         bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);
612 
613     // measure SAD cost at MV(0) if MVP is not zero
614     if (pmv.notZero())
615     {
616         int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));
617         if (cost < bcost)
618         {
619             bcost = cost;
620             bmv = 0;
621         }
622     }
623 
624     X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")
625     // measure SAD cost at each QPEL motion vector candidate
626     for (int i = 0; i < numCandidates; i++)
627     {
628         MV m = mvc[i].clipped(qmvmin, qmvmax);
629         if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured
630         {
631             int cost = subpelCompare(ref, m, sad) + mvcost(m);
632             if (cost < bprecost)
633             {
634                 bprecost = cost;
635                 bestpre = m;
636             }
637         }
638     }
639 
640     pmv = pmv.roundToFPel();
641     MV omv = bmv;  // current search origin or starting point
642 
643     switch (searchMethod)
644     {
645     case X265_DIA_SEARCH:
646     {
647         /* diamond search, radius 1 */
648         bcost <<= 4;
649         int i = merange;
650         do
651         {
652             COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);
653             COPY1_IF_LT(bcost, (costs[0] << 4) + 1);
654             COPY1_IF_LT(bcost, (costs[1] << 4) + 3);
655             COPY1_IF_LT(bcost, (costs[2] << 4) + 4);
656             COPY1_IF_LT(bcost, (costs[3] << 4) + 12);
657             if (!(bcost & 15))
658                 break;
659             bmv.x -= (bcost << 28) >> 30;
660             bmv.y -= (bcost << 30) >> 30;
661             bcost &= ~15;
662         }
663         while (--i && bmv.checkRange(mvmin, mvmax));
664         bcost >>= 4;
665         break;
666     }
667 
668     case X265_HEX_SEARCH:
669     {
670 me_hex2:
671         /* hexagon search, radius 2 */
672 #if 0
673         for (int i = 0; i < merange / 2; i++)
674         {
675             omv = bmv;
676             COST_MV(omv.x - 2, omv.y);
677             COST_MV(omv.x - 1, omv.y + 2);
678             COST_MV(omv.x + 1, omv.y + 2);
679             COST_MV(omv.x + 2, omv.y);
680             COST_MV(omv.x + 1, omv.y - 2);
681             COST_MV(omv.x - 1, omv.y - 2);
682             if (omv == bmv)
683                 break;
684             if (!bmv.checkRange(mvmin, mvmax))
685                 break;
686         }
687 
688 #else // if 0
689       /* equivalent to the above, but eliminates duplicate candidates */
690         COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);
691         bcost <<= 3;
692         COPY1_IF_LT(bcost, (costs[0] << 3) + 2);
693         COPY1_IF_LT(bcost, (costs[1] << 3) + 3);
694         COPY1_IF_LT(bcost, (costs[2] << 3) + 4);
695         COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);
696         COPY1_IF_LT(bcost, (costs[0] << 3) + 5);
697         COPY1_IF_LT(bcost, (costs[1] << 3) + 6);
698         COPY1_IF_LT(bcost, (costs[2] << 3) + 7);
699 
700         if (bcost & 7)
701         {
702             int dir = (bcost & 7) - 2;
703             bmv += hex2[dir + 1];
704 
705             /* half hexagon, not overlapping the previous iteration */
706             for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--)
707             {
708                 COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,
709                                hex2[dir + 1].x, hex2[dir + 1].y,
710                                hex2[dir + 2].x, hex2[dir + 2].y,
711                                costs);
712                 bcost &= ~7;
713                 COPY1_IF_LT(bcost, (costs[0] << 3) + 1);
714                 COPY1_IF_LT(bcost, (costs[1] << 3) + 2);
715                 COPY1_IF_LT(bcost, (costs[2] << 3) + 3);
716                 if (!(bcost & 7))
717                     break;
718                 dir += (bcost & 7) - 2;
719                 dir = mod6m1[dir + 1];
720                 bmv += hex2[dir + 1];
721             }
722         }
723         bcost >>= 3;
724 #endif // if 0
725 
726         /* square refine */
727         int dir = 0;
728         COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);
729         COPY2_IF_LT(bcost, costs[0], dir, 1);
730         COPY2_IF_LT(bcost, costs[1], dir, 2);
731         COPY2_IF_LT(bcost, costs[2], dir, 3);
732         COPY2_IF_LT(bcost, costs[3], dir, 4);
733         COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);
734         COPY2_IF_LT(bcost, costs[0], dir, 5);
735         COPY2_IF_LT(bcost, costs[1], dir, 6);
736         COPY2_IF_LT(bcost, costs[2], dir, 7);
737         COPY2_IF_LT(bcost, costs[3], dir, 8);
738         bmv += square1[dir];
739         break;
740     }
741 
742     case X265_UMH_SEARCH:
743     {
744         int ucost1, ucost2;
745         int16_t cross_start = 1;
746 
747         /* refine predictors */
748         omv = bmv;
749         ucost1 = bcost;
750         DIA1_ITER(pmv.x, pmv.y);
751         if (pmv.notZero())
752             DIA1_ITER(0, 0);
753 
754         ucost2 = bcost;
755         if (bmv.notZero() && bmv != pmv)
756             DIA1_ITER(bmv.x, bmv.y);
757         if (bcost == ucost2)
758             cross_start = 3;
759 
760         /* Early Termination */
761         omv = bmv;
762         if (bcost == ucost2 && SAD_THRESH(2000))
763         {
764             COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);
765             COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);
766             if (bcost == ucost1 && SAD_THRESH(500))
767                 break;
768             if (bcost == ucost2)
769             {
770                 int16_t range = (int16_t)(merange >> 1) | 1;
771                 CROSS(3, range, range);
772                 COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);
773                 COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);
774                 if (bcost == ucost2)
775                     break;
776                 cross_start = range + 2;
777             }
778         }
779 
780         // TODO: Need to study x264's logic for building mvc list to understand why they
781         //       have special cases here for 16x16, and whether they apply to HEVC CTU
782 
783         // adaptive search range based on mvc variability
784         if (numCandidates)
785         {
786             /* range multipliers based on casual inspection of some statistics of
787              * average distance between current predictor and final mv found by ESA.
788              * these have not been tuned much by actual encoding. */
789             static const uint8_t range_mul[4][4] =
790             {
791                 { 3, 3, 4, 4 },
792                 { 3, 4, 4, 4 },
793                 { 4, 4, 4, 5 },
794                 { 4, 4, 5, 6 },
795             };
796 
797             int mvd;
798             int sad_ctx, mvd_ctx;
799             int denom = 1;
800 
801             if (numCandidates == 1)
802             {
803                 if (LUMA_64x64 == partEnum)
804                     /* mvc is probably the same as mvp, so the difference isn't meaningful.
805                      * but prediction usually isn't too bad, so just use medium range */
806                     mvd = 25;
807                 else
808                     mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
809             }
810             else
811             {
812                 /* calculate the degree of agreement between predictors. */
813 
814                 /* in 64x64, mvc includes all the neighbors used to make mvp,
815                  * so don't count mvp separately. */
816 
817                 denom = numCandidates - 1;
818                 mvd = 0;
819                 if (partEnum != LUMA_64x64)
820                 {
821                     mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);
822                     denom++;
823                 }
824                 mvd += predictorDifference(mvc, numCandidates);
825             }
826 
827             sad_ctx = SAD_THRESH(1000) ? 0
828                 : SAD_THRESH(2000) ? 1
829                 : SAD_THRESH(4000) ? 2 : 3;
830             mvd_ctx = mvd < 10 * denom ? 0
831                 : mvd < 20 * denom ? 1
832                 : mvd < 40 * denom ? 2 : 3;
833 
834             merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;
835         }
836 
837         /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
838          * we are still centered on the same place as the DIA2. is this desirable? */
839         CROSS(cross_start, merange, merange >> 1);
840         COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);
841 
842         /* hexagon grid */
843         omv = bmv;
844         const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;
845         const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;
846         uint16_t i = 1;
847         do
848         {
849             if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,
850                                   mvmax.y - omv.y, omv.y - mvmin.y))
851             {
852                 for (int j = 0; j < 16; j++)
853                 {
854                     MV mv = omv + (hex4[j] * i);
855                     if (mv.checkRange(mvmin, mvmax))
856                         COST_MV(mv.x, mv.y);
857                 }
858             }
859             else
860             {
861                 int16_t dir = 0;
862                 pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;
863                 size_t dy = (size_t)i * stride;
864 #define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \
865     sad_x4(fenc, \
866            fref_base x0 * i + (y0 - 2 * k + 4) * dy, \
867            fref_base x1 * i + (y1 - 2 * k + 4) * dy, \
868            fref_base x2 * i + (y2 - 2 * k + 4) * dy, \
869            fref_base x3 * i + (y3 - 2 * k + 4) * dy, \
870            stride, costs + 4 * k); \
871     fref_base += 2 * dy;
872 #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
873 #define MIN_MV(k, x, y)     COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15))
874 
875                 SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
876                 SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
877                 SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);
878                 SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);
879                 ADD_MVCOST(0, 0, -4);
880                 ADD_MVCOST(1, 0, 4);
881                 ADD_MVCOST(2, -2, -3);
882                 ADD_MVCOST(3, 2, -3);
883                 ADD_MVCOST(4, -4, -2);
884                 ADD_MVCOST(5, 4, -2);
885                 ADD_MVCOST(6, -4, -1);
886                 ADD_MVCOST(7, 4, -1);
887                 ADD_MVCOST(8, -4, 0);
888                 ADD_MVCOST(9, 4, 0);
889                 ADD_MVCOST(10, -4, 1);
890                 ADD_MVCOST(11, 4, 1);
891                 ADD_MVCOST(12, -4, 2);
892                 ADD_MVCOST(13, 4, 2);
893                 ADD_MVCOST(14, -2, 3);
894                 ADD_MVCOST(15, 2, 3);
895                 MIN_MV(0, 0, -4);
896                 MIN_MV(1, 0, 4);
897                 MIN_MV(2, -2, -3);
898                 MIN_MV(3, 2, -3);
899                 MIN_MV(4, -4, -2);
900                 MIN_MV(5, 4, -2);
901                 MIN_MV(6, -4, -1);
902                 MIN_MV(7, 4, -1);
903                 MIN_MV(8, -4, 0);
904                 MIN_MV(9, 4, 0);
905                 MIN_MV(10, -4, 1);
906                 MIN_MV(11, 4, 1);
907                 MIN_MV(12, -4, 2);
908                 MIN_MV(13, 4, 2);
909                 MIN_MV(14, -2, 3);
910                 MIN_MV(15, 2, 3);
911 #undef SADS
912 #undef ADD_MVCOST
913 #undef MIN_MV
914                 if (dir)
915                 {
916                     bmv.x = omv.x + i * (dir >> 4);
917                     bmv.y = omv.y + i * ((dir << 28) >> 28);
918                 }
919             }
920         }
921         while (++i <= merange >> 2);
922         if (bmv.checkRange(mvmin, mvmax))
923             goto me_hex2;
924         break;
925     }
926 
927     case X265_STAR_SEARCH: // Adapted from HM ME
928     {
929         int bPointNr = 0;
930         int bDistance = 0;
931 
932         const int EarlyExitIters = 3;
933         StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange);
934         if (bDistance == 1)
935         {
936             // if best distance was only 1, check two missing points.  If no new point is found, stop
937             if (bPointNr)
938             {
939                 /* For a given direction 1 to 8, check nearest two outer X pixels
940                      X   X
941                    X 1 2 3 X
942                      4 * 5
943                    X 6 7 8 X
944                      X   X
945                 */
946                 int saved = bcost;
947                 const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
948                 const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
949                 if (mv1.checkRange(mvmin, mvmax))
950                 {
951                     COST_MV(mv1.x, mv1.y);
952                 }
953                 if (mv2.checkRange(mvmin, mvmax))
954                 {
955                     COST_MV(mv2.x, mv2.y);
956                 }
957                 if (bcost == saved)
958                     break;
959             }
960             else
961                 break;
962         }
963 
964         const int RasterDistance = 5;
965         if (bDistance > RasterDistance)
966         {
967             // raster search refinement if original search distance was too big
968             MV tmv;
969             for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance)
970             {
971                 for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance)
972                 {
973                     if (tmv.x + (RasterDistance * 3) <= mvmax.x)
974                     {
975                         pixel *pix_base = fref + tmv.y * stride + tmv.x;
976                         sad_x4(fenc,
977                                pix_base,
978                                pix_base + RasterDistance,
979                                pix_base + RasterDistance * 2,
980                                pix_base + RasterDistance * 3,
981                                stride, costs);
982                         costs[0] += mvcost(tmv << 2);
983                         COPY2_IF_LT(bcost, costs[0], bmv, tmv);
984                         tmv.x += RasterDistance;
985                         costs[1] += mvcost(tmv << 2);
986                         COPY2_IF_LT(bcost, costs[1], bmv, tmv);
987                         tmv.x += RasterDistance;
988                         costs[2] += mvcost(tmv << 2);
989                         COPY2_IF_LT(bcost, costs[2], bmv, tmv);
990                         tmv.x += RasterDistance;
991                         costs[3] += mvcost(tmv << 3);
992                         COPY2_IF_LT(bcost, costs[3], bmv, tmv);
993                     }
994                     else
995                         COST_MV(tmv.x, tmv.y);
996                 }
997             }
998         }
999 
1000         while (bDistance > 0)
1001         {
1002             // center a new search around current best
1003             bDistance = 0;
1004             bPointNr = 0;
1005             const int MaxIters = 32;
1006             StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange);
1007 
1008             if (bDistance == 1)
1009             {
1010                 if (!bPointNr)
1011                     break;
1012 
1013                 /* For a given direction 1 to 8, check nearest 2 outer X pixels
1014                         X   X
1015                     X 1 2 3 X
1016                         4 * 5
1017                     X 6 7 8 X
1018                         X   X
1019                 */
1020                 const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];
1021                 const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];
1022                 if (mv1.checkRange(mvmin, mvmax))
1023                 {
1024                     COST_MV(mv1.x, mv1.y);
1025                 }
1026                 if (mv2.checkRange(mvmin, mvmax))
1027                 {
1028                     COST_MV(mv2.x, mv2.y);
1029                 }
1030                 break;
1031             }
1032         }
1033 
1034         break;
1035     }
1036 
1037     case X265_FULL_SEARCH:
1038     {
1039         // dead slow exhaustive search, but at least it uses sad_x4()
1040         MV tmv;
1041         for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y++)
1042         {
1043             for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x++)
1044             {
1045                 if (tmv.x + 3 <= mvmax.x)
1046                 {
1047                     pixel *pix_base = fref + tmv.y * stride + tmv.x;
1048                     sad_x4(fenc,
1049                            pix_base,
1050                            pix_base + 1,
1051                            pix_base + 2,
1052                            pix_base + 3,
1053                            stride, costs);
1054                     costs[0] += mvcost(tmv << 2);
1055                     COPY2_IF_LT(bcost, costs[0], bmv, tmv);
1056                     tmv.x++;
1057                     costs[1] += mvcost(tmv << 2);
1058                     COPY2_IF_LT(bcost, costs[1], bmv, tmv);
1059                     tmv.x++;
1060                     costs[2] += mvcost(tmv << 2);
1061                     COPY2_IF_LT(bcost, costs[2], bmv, tmv);
1062                     tmv.x++;
1063                     costs[3] += mvcost(tmv << 2);
1064                     COPY2_IF_LT(bcost, costs[3], bmv, tmv);
1065                 }
1066                 else
1067                     COST_MV(tmv.x, tmv.y);
1068             }
1069         }
1070 
1071         break;
1072     }
1073 
1074     default:
1075         X265_CHECK(0, "invalid motion estimate mode\n");
1076         break;
1077     }
1078 
1079     if (bprecost < bcost)
1080     {
1081         bmv = bestpre;
1082         bcost = bprecost;
1083     }
1084     else
1085         bmv = bmv.toQPel(); // promote search bmv to qpel
1086 
1087     const SubpelWorkload& wl = workload[this->subpelRefine];
1088 
1089     if (!bcost)
1090     {
1091         /* if there was zero residual at the clipped MVP, we can skip subpel
1092          * refine, but we do need to include the mvcost in the returned cost */
1093         bcost = mvcost(bmv);
1094     }
1095     else if (ref->isLowres)
1096     {
1097         int bdir = 0;
1098         for (int i = 1; i <= wl.hpel_dirs; i++)
1099         {
1100             MV qmv = bmv + square1[i] * 2;
1101             int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
1102             COPY2_IF_LT(bcost, cost, bdir, i);
1103         }
1104 
1105         bmv += square1[bdir] * 2;
1106         bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd) + mvcost(bmv);
1107 
1108         bdir = 0;
1109         for (int i = 1; i <= wl.qpel_dirs; i++)
1110         {
1111             MV qmv = bmv + square1[i];
1112             int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv);
1113             COPY2_IF_LT(bcost, cost, bdir, i);
1114         }
1115 
1116         bmv += square1[bdir];
1117     }
1118     else
1119     {
1120         pixelcmp_t hpelcomp;
1121 
1122         if (wl.hpel_satd)
1123         {
1124             bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1125             hpelcomp = satd;
1126         }
1127         else
1128             hpelcomp = sad;
1129 
1130         for (int iter = 0; iter < wl.hpel_iters; iter++)
1131         {
1132             int bdir = 0;
1133             for (int i = 1; i <= wl.hpel_dirs; i++)
1134             {
1135                 MV qmv = bmv + square1[i] * 2;
1136                 int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);
1137                 COPY2_IF_LT(bcost, cost, bdir, i);
1138             }
1139 
1140             if (bdir)
1141                 bmv += square1[bdir] * 2;
1142             else
1143                 break;
1144         }
1145 
1146         /* if HPEL search used SAD, remeasure with SATD before QPEL */
1147         if (!wl.hpel_satd)
1148             bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);
1149 
1150         for (int iter = 0; iter < wl.qpel_iters; iter++)
1151         {
1152             int bdir = 0;
1153             for (int i = 1; i <= wl.qpel_dirs; i++)
1154             {
1155                 MV qmv = bmv + square1[i];
1156                 int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);
1157                 COPY2_IF_LT(bcost, cost, bdir, i);
1158             }
1159 
1160             if (bdir)
1161                 bmv += square1[bdir];
1162             else
1163                 break;
1164         }
1165     }
1166 
1167     x265_emms();
1168     outQMv = bmv;
1169     return bcost;
1170 }
1171 
subpelCompare(ReferencePlanes * ref,const MV & qmv,pixelcmp_t cmp)1172 int MotionEstimate::subpelCompare(ReferencePlanes *ref, const MV& qmv, pixelcmp_t cmp)
1173 {
1174     intptr_t refStride = ref->lumaStride;
1175     pixel *fref = ref->fpelPlane[0] + blockOffset + (qmv.x >> 2) + (qmv.y >> 2) * refStride;
1176     int xFrac = qmv.x & 0x3;
1177     int yFrac = qmv.y & 0x3;
1178     int cost;
1179     intptr_t lclStride = fencPUYuv.m_size;
1180     X265_CHECK(lclStride == FENC_STRIDE, "fenc buffer is assumed to have FENC_STRIDE by sad_x3 and sad_x4\n");
1181 
1182     if (!(yFrac | xFrac))
1183         cost = cmp(fencPUYuv.m_buf[0], lclStride, fref, refStride);
1184     else
1185     {
1186         /* we are taking a short-cut here if the reference is weighted. To be
1187          * accurate we should be interpolating unweighted pixels and weighting
1188          * the final 16bit values prior to rounding and down shifting. Instead we
1189          * are simply interpolating the weighted full-pel pixels. Not 100%
1190          * accurate but good enough for fast qpel ME */
1191         ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1192         if (!yFrac)
1193             primitives.pu[partEnum].luma_hpp(fref, refStride, subpelbuf, lclStride, xFrac);
1194         else if (!xFrac)
1195             primitives.pu[partEnum].luma_vpp(fref, refStride, subpelbuf, lclStride, yFrac);
1196         else
1197             primitives.pu[partEnum].luma_hvpp(fref, refStride, subpelbuf, lclStride, xFrac, yFrac);
1198 
1199         cost = cmp(fencPUYuv.m_buf[0], lclStride, subpelbuf, lclStride);
1200     }
1201 
1202     if (bChromaSATD)
1203     {
1204         int csp    = fencPUYuv.m_csp;
1205         int hshift = fencPUYuv.m_hChromaShift;
1206         int vshift = fencPUYuv.m_vChromaShift;
1207         int shiftHor = (2 + hshift);
1208         int shiftVer = (2 + vshift);
1209         lclStride = fencPUYuv.m_csize;
1210 
1211         intptr_t refStrideC = ref->reconPic->m_strideC;
1212         intptr_t refOffset = (qmv.x >> shiftHor) + (qmv.y >> shiftVer) * refStrideC;
1213 
1214         const pixel* refCb = ref->getCbAddr(ctuAddr, absPartIdx) + refOffset;
1215         const pixel* refCr = ref->getCrAddr(ctuAddr, absPartIdx) + refOffset;
1216 
1217         xFrac = qmv.x & ((1 << shiftHor) - 1);
1218         yFrac = qmv.y & ((1 << shiftVer) - 1);
1219 
1220         if (!(yFrac | xFrac))
1221         {
1222             cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, refCb, refStrideC);
1223             cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, refCr, refStrideC);
1224         }
1225         else
1226         {
1227             ALIGN_VAR_32(pixel, subpelbuf[64 * 64]);
1228             if (!yFrac)
1229             {
1230                 primitives.chroma[csp].pu[partEnum].filter_hpp(refCb, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1231                 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1232 
1233                 primitives.chroma[csp].pu[partEnum].filter_hpp(refCr, refStrideC, subpelbuf, lclStride, xFrac << (1 - hshift));
1234                 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1235             }
1236             else if (!xFrac)
1237             {
1238                 primitives.chroma[csp].pu[partEnum].filter_vpp(refCb, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1239                 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1240 
1241                 primitives.chroma[csp].pu[partEnum].filter_vpp(refCr, refStrideC, subpelbuf, lclStride, yFrac << (1 - vshift));
1242                 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1243             }
1244             else
1245             {
1246                 ALIGN_VAR_32(int16_t, immed[64 * (64 + NTAPS_CHROMA)]);
1247 
1248                 int extStride = blockwidth >> hshift;
1249                 int filterSize = NTAPS_CHROMA;
1250                 int halfFilterSize = (filterSize >> 1);
1251 
1252                 primitives.chroma[csp].pu[partEnum].filter_hps(refCb, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1253                 primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1254                 cost += chromaSatd(fencPUYuv.m_buf[1], lclStride, subpelbuf, lclStride);
1255 
1256                 primitives.chroma[csp].pu[partEnum].filter_hps(refCr, refStrideC, immed, extStride, xFrac << (1 - hshift), 1);
1257                 primitives.chroma[csp].pu[partEnum].filter_vsp(immed + (halfFilterSize - 1) * extStride, extStride, subpelbuf, lclStride, yFrac << (1 - vshift));
1258                 cost += chromaSatd(fencPUYuv.m_buf[2], lclStride, subpelbuf, lclStride);
1259             }
1260         }
1261     }
1262 
1263     return cost;
1264 }
1265